[llvm] [AMDGPU] Add intrinsics and MIs for ds_bvh_stack_* (PR #130007)
Mariusz Sikora via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 6 00:27:23 PST 2025
https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/130007
>From d7a76b701125a0b6dae8f5e60749a20ed6337e05 Mon Sep 17 00:00:00 2001
From: Mateja Marjanovic <mateja.marjanovic at amd.com>
Date: Sun, 2 Mar 2025 16:01:03 -0500
Subject: [PATCH 1/2] [AMDGPU] Add intrinsics and MIs for ds_bvh_stack_*
New intrinsics / instructions:
int_amdgcn_ds_bvh_stack_push4_pop1_rtn / ds_bvh_stack_push4_pop1_rtn_b32
int_amdgcn_ds_bvh_stack_push8_pop1_rtn / ds_bvh_stack_push8_pop1_rtn_b32
int_amdgcn_ds_bvh_stack_push8_pop2_rtn / ds_bvh_stack_push8_pop2_rtn_b64
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 30 +++++-
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 21 +++-
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 2 +-
.../AMDGPU/AMDGPUInstructionSelector.cpp | 19 +++-
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 5 +-
llvm/lib/Target/AMDGPU/DSInstructions.td | 24 ++++-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 5 +-
.../llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll | 98 +++++++++++++++++++
llvm/test/MC/AMDGPU/gfx12_asm_ds.s | 18 ++++
.../MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt | 24 +++++
10 files changed, 234 insertions(+), 12 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 86e050333acc7..f717a87342642 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2714,7 +2714,7 @@ def int_amdgcn_ds_sub_gs_reg_rtn :
[ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree],
"", [SDNPMemOperand]>;
-def int_amdgcn_ds_bvh_stack_rtn :
+class IntDSBVHStackRtn :
Intrinsic<
[llvm_i32_ty, llvm_i32_ty], // %vdst, %addr
[
@@ -2726,6 +2726,8 @@ def int_amdgcn_ds_bvh_stack_rtn :
[ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
+def int_amdgcn_ds_bvh_stack_rtn : IntDSBVHStackRtn;
+
def int_amdgcn_s_wait_event_export_ready :
ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">,
Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]
@@ -2801,6 +2803,32 @@ def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty
// GFX12 Intrinsics
//===----------------------------------------------------------------------===//
+def int_amdgcn_ds_bvh_stack_push4_pop1_rtn : IntDSBVHStackRtn;
+
+def int_amdgcn_ds_bvh_stack_push8_pop1_rtn :
+ Intrinsic<
+ [llvm_i32_ty, llvm_i32_ty], // %vdst (32-bit), %addr (returned alongside %vdst)
+ [
+ llvm_i32_ty, // %addr
+ llvm_i32_ty, // %data0
+ llvm_v8i32_ty, // %data1 (8 x i32, vs. 4 x i32 for push4)
+ llvm_i32_ty, // %offset (must be an immediate, see ImmArg below)
+ ],
+ [ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree] // same properties as IntDSBVHStackRtn
+ >;
+
+def int_amdgcn_ds_bvh_stack_push8_pop2_rtn :
+ Intrinsic<
+ [llvm_i64_ty, llvm_i32_ty], // %vdst (64-bit), %addr (returned alongside %vdst)
+ [
+ llvm_i32_ty, // %addr
+ llvm_i32_ty, // %data0
+ llvm_v8i32_ty, // %data1 (8 x i32, vs. 4 x i32 for push4)
+ llvm_i32_ty, // %offset (must be an immediate, see ImmArg below)
+ ],
+ [ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree] // same properties as IntDSBVHStackRtn
+ >;
+
// llvm.amdgcn.permlane16.var <old> <src0> <src1> <fi> <bound_control>
def int_amdgcn_permlane16_var : ClangBuiltin<"__builtin_amdgcn_permlane16_var">,
Intrinsic<[llvm_i32_ty],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 8e90754103ff1..e93a401ee20fb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2665,8 +2665,20 @@ void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
-void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
- unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
+void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
+ unsigned Opc;
+ switch (IntrID) {
+ case Intrinsic::amdgcn_ds_bvh_stack_rtn:
+ case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
+ Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
+ break;
+ case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
+ Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
+ break;
+ case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
+ Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
+ break;
+ }
SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
N->getOperand(5), N->getOperand(0)};
@@ -2830,7 +2842,10 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
return;
}
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
- SelectDSBvhStackIntrinsic(N);
+ case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
+ case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
+ case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
+ SelectDSBvhStackIntrinsic(N, IntrID);
return;
case Intrinsic::amdgcn_init_whole_wave:
CurDAG->getMachineFunction()
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 7dcd208a9cdd4..f3b9364fdb92b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -267,7 +267,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
void SelectFMAD_FMA(SDNode *N);
void SelectFP_EXTEND(SDNode *N);
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
- void SelectDSBvhStackIntrinsic(SDNode *N);
+ void SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID);
void SelectDS_GWS(SDNode *N, unsigned IntrID);
void SelectInterpP1F16(SDNode *N);
void SelectINTRINSIC_W_CHAIN(SDNode *N);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index eb781cbd1c8da..8822b8ab7b8ec 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2266,7 +2266,21 @@ bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
Register Data1 = MI.getOperand(5).getReg();
unsigned Offset = MI.getOperand(6).getImm();
- auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
+ unsigned Opc;
+ switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
+ case Intrinsic::amdgcn_ds_bvh_stack_rtn:
+ case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
+ Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
+ break;
+ case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
+ Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
+ break;
+ case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
+ Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
+ break;
+ }
+
+ auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
.addDef(Dst1)
.addUse(Addr)
.addUse(Data0)
@@ -2321,6 +2335,9 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
}
break;
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
+ case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
+ case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
+ case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
return selectDSBvhStackIntrinsic(I);
case Intrinsic::amdgcn_s_barrier_signal_var:
return selectNamedBarrierInit(I, IntrinsicID);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index d79200c319b65..5f85836510c34 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5252,7 +5252,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
break;
- case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
+ case Intrinsic::amdgcn_ds_bvh_stack_rtn:
+ case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
+ case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
+ case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
OpdsMapping[0] =
getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
OpdsMapping[1] =
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index d3487daee364f..b575f2bafa18e 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -258,10 +258,13 @@ multiclass DS_1A2D_Off8_RET_mc<string opName,
}
}
-class DS_BVH_STACK<string opName>
+class DS_BVH_STACK<string opName,
+ RegisterClass rc,
+ RegisterClass src>
: DS_Pseudo<opName,
- (outs getLdStRegisterOperand<VGPR_32>.ret:$vdst, VGPR_32:$addr),
- (ins VGPR_32:$addr_in, getLdStRegisterOperand<VGPR_32>.ret:$data0, VReg_128:$data1, Offset:$offset),
+ (outs getLdStRegisterOperand<rc>.ret:$vdst, VGPR_32:$addr),
+ (ins VGPR_32:$addr_in, getLdStRegisterOperand<VGPR_32>.ret:$data0,
+ src:$data1, Offset:$offset),
" $vdst, $addr, $data0, $data1$offset"> {
let Constraints = "$addr = $addr_in";
let DisableEncoding = "$addr_in";
@@ -722,7 +725,8 @@ def DS_SUB_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_sub_gs_reg_rtn", VReg_64, VGPR_32>;
let SubtargetPredicate = isGFX11Plus in {
let OtherPredicates = [HasImageInsts] in
-def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32">;
+def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32",
+ VGPR_32, VReg_128> ;
} // let SubtargetPredicate = isGFX11Plus
@@ -732,6 +736,13 @@ def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32">;
let SubtargetPredicate = isGFX12Plus in {
+let OtherPredicates = [HasImageInsts] in {
+def DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_BVH_STACK<
+ "ds_bvh_stack_push8_pop1_rtn_b32", VGPR_32, VReg_256>;
+def DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_BVH_STACK<
+ "ds_bvh_stack_push8_pop2_rtn_b64", VReg_64, VReg_256>;
+} // End OtherPredicates = [HasImageInsts].
+
defm DS_COND_SUB_U32 : DS_1A1D_NORET_mc<"ds_cond_sub_u32">;
defm DS_COND_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_cond_sub_rtn_u32", VGPR_32>;
defm DS_SUB_CLAMP_U32 : DS_1A1D_NORET_mc<"ds_sub_clamp_u32">;
@@ -1268,6 +1279,11 @@ defm DS_PK_ADD_BF16 : DS_Real_gfx12<0x09b>;
defm DS_PK_ADD_RTN_BF16 : DS_Real_gfx12<0x0ab>;
defm DS_BPERMUTE_FI_B32 : DS_Real_gfx12<0x0cd>;
+defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx12<0x0e0,
+ "ds_bvh_stack_push4_pop1_rtn_b32", true>;
+defm DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_Real_gfx12<0x0e1>;
+defm DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_Real_gfx12<0x0e2>;
+
// New aliases added in GFX12 without renaming the instructions.
let AssemblerPredicate = isGFX12Plus in {
def : AMDGPUMnemonicAlias<"ds_subrev_u32", "ds_rsub_u32">;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index fe095414e5172..76cc687ddc125 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1457,7 +1457,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
}
- case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
+ case Intrinsic::amdgcn_ds_bvh_stack_rtn:
+ case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
+ case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
+ case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
const GCNTargetMachine &TM =
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll
new file mode 100644
index 0000000000000..e46bdae883c2b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll
@@ -0,0 +1,98 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s
+
+declare { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn(i32, i32, <4 x i32>, i32 immarg)
+declare { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn(i32, i32, <8 x i32>, i32 immarg)
+declare { i64, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn(i32, i32, <8 x i32>, i32 immarg)
+declare void @llvm.amdgcn.exp.i32(i32, i32, i32, i32, i32, i32, i1, i1)
+
+define amdgpu_gs void @test_ds_bvh_stack_push4_pop1(i32 %addr, i32 %data0, <4 x i32> %data1) {
+; CHECK-LABEL: test_ds_bvh_stack_push4_pop1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5]
+; CHECK-NEXT: s_wait_dscnt 0x0
+; CHECK-NEXT: export prim v1, off, off, off done
+; CHECK-NEXT: s_endpgm
+ %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, i32 0)
+ %vdst = extractvalue { i32, i32 } %pair, 0
+ %newaddr = extractvalue { i32, i32 } %pair, 1
+ call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 undef, i32 undef, i1 true, i1 false)
+ ret void
+}
+
+define amdgpu_gs void @test_ds_bvh_stack_push4_pop1_1(i32 %addr, i32 %data0, <4 x i32> %data1) {
+; CHECK-LABEL: test_ds_bvh_stack_push4_pop1_1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] offset:1
+; CHECK-NEXT: s_wait_dscnt 0x0
+; CHECK-NEXT: export prim v1, off, off, off done
+; CHECK-NEXT: s_endpgm
+ %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, i32 1)
+ %vdst = extractvalue { i32, i32 } %pair, 0
+ %newaddr = extractvalue { i32, i32 } %pair, 1
+ call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 undef, i32 undef, i1 true, i1 false)
+ ret void
+}
+
+define amdgpu_gs void @test_ds_bvh_stack_push8_pop1(i32 %addr, i32 %data0, <8 x i32> %data1) {
+; CHECK-LABEL: test_ds_bvh_stack_push8_pop1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9]
+; CHECK-NEXT: s_wait_dscnt 0x0
+; CHECK-NEXT: export prim v1, off, off, off done
+; CHECK-NEXT: s_endpgm
+ %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 0)
+ %vdst = extractvalue { i32, i32 } %pair, 0
+ %newaddr = extractvalue { i32, i32 } %pair, 1
+ call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 undef, i32 undef, i1 true, i1 false)
+ ret void
+}
+
+define amdgpu_gs void @test_ds_bvh_stack_push8_pop1_1(i32 %addr, i32 %data0, <8 x i32> %data1) {
+; CHECK-LABEL: test_ds_bvh_stack_push8_pop1_1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] offset:1
+; CHECK-NEXT: s_wait_dscnt 0x0
+; CHECK-NEXT: export prim v1, off, off, off done
+; CHECK-NEXT: s_endpgm
+ %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 1)
+ %vdst = extractvalue { i32, i32 } %pair, 0
+ %newaddr = extractvalue { i32, i32 } %pair, 1
+ call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 undef, i32 undef, i1 true, i1 false)
+ ret void
+}
+
+define amdgpu_gs void @test_ds_bvh_stack_push8_pop2(i32 %addr, i32 %data0, <8 x i32> %data1, ptr addrspace(1) %out1, ptr addrspace(1) %out2) {
+; CHECK-LABEL: test_ds_bvh_stack_push8_pop2:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: ds_bvh_stack_push8_pop2_rtn_b64 v[1:2], v0, v1, v[2:9]
+; CHECK-NEXT: s_wait_dscnt 0x0
+; CHECK-NEXT: export prim v1, off, off, off done
+; CHECK-NEXT: s_endpgm
+ %pair = call { i64, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 0)
+ %vdst = extractvalue { i64, i32 } %pair, 0
+ %newaddr = extractvalue { i64, i32 } %pair, 1
+ %vdst.v2i32 = bitcast i64 %vdst to <2 x i32>
+ %vdst.lo = extractelement <2 x i32> %vdst.v2i32, i32 0
+ %vdst.hi = extractelement <2 x i32> %vdst.v2i32, i32 1
+ call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst.lo, i32 %vdst.hi, i32 %newaddr, i32 undef, i1 true, i1 false)
+ ret void
+}
+
+define amdgpu_gs void @test_ds_bvh_stack_push8_pop2_1(i32 %addr, i32 %data0, <8 x i32> %data1, ptr addrspace(1) %out1, ptr addrspace(1) %out2) {
+; CHECK-LABEL: test_ds_bvh_stack_push8_pop2_1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: ds_bvh_stack_push8_pop2_rtn_b64 v[1:2], v0, v1, v[2:9] offset:1
+; CHECK-NEXT: s_wait_dscnt 0x0
+; CHECK-NEXT: export prim v1, off, off, off done
+; CHECK-NEXT: s_endpgm
+ %pair = call { i64, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 1)
+ %vdst = extractvalue { i64, i32 } %pair, 0
+ %newaddr = extractvalue { i64, i32 } %pair, 1
+ %vdst.v2i32 = bitcast i64 %vdst to <2 x i32>
+ %vdst.lo = extractelement <2 x i32> %vdst.v2i32, i32 0
+ %vdst.hi = extractelement <2 x i32> %vdst.v2i32, i32 1
+ call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst.lo, i32 %vdst.hi, i32 %newaddr, i32 undef, i1 true, i1 false)
+ ret void
+}
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_ds.s b/llvm/test/MC/AMDGPU/gfx12_asm_ds.s
index 34c42affdd46c..364463f9404bc 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_ds.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_ds.s
@@ -1922,3 +1922,21 @@ ds_bpermute_fi_b32 v5, v1, v2 offset:0
ds_bpermute_fi_b32 v255, v255, v255 offset:4
// GFX12: encoding: [0x04,0x00,0x34,0xdb,0xff,0xff,0x00,0xff]
+
+ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5]
+// GFX12: encoding: [0x00,0x00,0x80,0xdb,0x00,0x01,0x02,0x01]
+
+ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] offset:1
+// GFX12: encoding: [0x01,0x00,0x80,0xdb,0x00,0x01,0x02,0x01]
+
+ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9]
+// GFX12: encoding: [0x00,0x00,0x84,0xdb,0x00,0x01,0x02,0x01]
+
+ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] offset:1
+// GFX12: encoding: [0x01,0x00,0x84,0xdb,0x00,0x01,0x02,0x01]
+
+ds_bvh_stack_push8_pop2_rtn_b64 v[254:255], v253, v252, v[244:251]
+// GFX12: encoding: [0x00,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe]
+
+ds_bvh_stack_push8_pop2_rtn_b64 v[254:255], v253, v252, v[244:251] offset:127
+// GFX12: encoding: [0x7f,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt
index d66748135ffd4..d9381b50ca29f 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt
@@ -3242,3 +3242,27 @@
# GFX12: ds_bpermute_fi_b32 v255, v255, v255 offset:4 ; encoding: [0x04,0x00,0x34,0xdb,0xff,0xff,0x00,0xff]
0x04,0x00,0x34,0xdb,0xff,0xff,0x00,0xff
+
+# GFX12: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] ; encoding: [0x00,0x00,0x80,0xdb,0x00,0x01,0x02,0x01]
+0x00,0x00,0x80,0xdb,0x00,0x01,0x02,0x01
+
+# GFX12: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[2:5] offset:1 ; encoding: [0x01,0x00,0x80,0xdb,0x00,0x01,0x02,0x01]
+0x01,0x00,0x80,0xdb,0x00,0x01,0x02,0x01
+
+# GFX12: ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] ; encoding: [0x00,0x00,0x84,0xdb,0x00,0x01,0x02,0x01]
+0x00,0x00,0x84,0xdb,0x00,0x01,0x02,0x01
+
+# GFX12: ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[2:9] offset:1 ; encoding: [0x01,0x00,0x84,0xdb,0x00,0x01,0x02,0x01]
+0x01,0x00,0x84,0xdb,0x00,0x01,0x02,0x01
+
+# GFX12: ds_bvh_stack_push8_pop2_rtn_b64 v[254:255], v253, v252, v[244:251] ; encoding: [0x00,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe]
+0x00,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe
+
+# GFX12: ds_bvh_stack_push8_pop2_rtn_b64 v[254:255], v253, v252, v[244:251] offset:127 ; encoding: [0x7f,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe]
+0x7f,0x00,0x88,0xdb,0xfd,0xfc,0xf4,0xfe
+
+# GFX12: ds_bvh_stack_push8_pop2_rtn_b64 v[1:2], v3, v4, v[5:12] offset:127 ; encoding: [0x7f,0x00,0x88,0xdb,0x03,0x04,0x05,0x01]
+0x7f,0x00,0x88,0xdb,0x03,0x04,0x05,0x01
+
+# GFX12: ds_bvh_stack_push8_pop2_rtn_b64 v[1:2], v3, v4, v[5:12] ; encoding: [0x00,0x00,0x88,0xdb,0x03,0x04,0x05,0x01]
+0x00,0x00,0x88,0xdb,0x03,0x04,0x05,0x01
>From 97bf706de1417fbfcad00cb9640af1a4784d700d Mon Sep 17 00:00:00 2001
From: Mariusz Sikora <mariusz.sikora at amd.com>
Date: Thu, 6 Mar 2025 03:27:01 -0500
Subject: [PATCH 2/2] Use poison instead of undef
---
.../AMDGPU/llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll
index e46bdae883c2b..44971e7304dbd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.push.pop.rtn.ll
@@ -17,7 +17,7 @@ define amdgpu_gs void @test_ds_bvh_stack_push4_pop1(i32 %addr, i32 %data0, <4 x
%pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, i32 0)
%vdst = extractvalue { i32, i32 } %pair, 0
%newaddr = extractvalue { i32, i32 } %pair, 1
- call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 undef, i32 undef, i1 true, i1 false)
+ call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 poison, i32 poison, i1 true, i1 false)
ret void
}
@@ -31,7 +31,7 @@ define amdgpu_gs void @test_ds_bvh_stack_push4_pop1_1(i32 %addr, i32 %data0, <4
%pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, i32 1)
%vdst = extractvalue { i32, i32 } %pair, 0
%newaddr = extractvalue { i32, i32 } %pair, 1
- call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 undef, i32 undef, i1 true, i1 false)
+ call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 poison, i32 poison, i1 true, i1 false)
ret void
}
@@ -45,7 +45,7 @@ define amdgpu_gs void @test_ds_bvh_stack_push8_pop1(i32 %addr, i32 %data0, <8 x
%pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 0)
%vdst = extractvalue { i32, i32 } %pair, 0
%newaddr = extractvalue { i32, i32 } %pair, 1
- call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 undef, i32 undef, i1 true, i1 false)
+ call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 poison, i32 poison, i1 true, i1 false)
ret void
}
@@ -59,7 +59,7 @@ define amdgpu_gs void @test_ds_bvh_stack_push8_pop1_1(i32 %addr, i32 %data0, <8
%pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn(i32 %addr, i32 %data0, <8 x i32> %data1, i32 1)
%vdst = extractvalue { i32, i32 } %pair, 0
%newaddr = extractvalue { i32, i32 } %pair, 1
- call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 undef, i32 undef, i1 true, i1 false)
+ call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst, i32 %newaddr, i32 poison, i32 poison, i1 true, i1 false)
ret void
}
@@ -76,7 +76,7 @@ define amdgpu_gs void @test_ds_bvh_stack_push8_pop2(i32 %addr, i32 %data0, <8 x
%vdst.v2i32 = bitcast i64 %vdst to <2 x i32>
%vdst.lo = extractelement <2 x i32> %vdst.v2i32, i32 0
%vdst.hi = extractelement <2 x i32> %vdst.v2i32, i32 1
- call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst.lo, i32 %vdst.hi, i32 %newaddr, i32 undef, i1 true, i1 false)
+ call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst.lo, i32 %vdst.hi, i32 %newaddr, i32 poison, i1 true, i1 false)
ret void
}
@@ -93,6 +93,6 @@ define amdgpu_gs void @test_ds_bvh_stack_push8_pop2_1(i32 %addr, i32 %data0, <8
%vdst.v2i32 = bitcast i64 %vdst to <2 x i32>
%vdst.lo = extractelement <2 x i32> %vdst.v2i32, i32 0
%vdst.hi = extractelement <2 x i32> %vdst.v2i32, i32 1
- call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst.lo, i32 %vdst.hi, i32 %newaddr, i32 undef, i1 true, i1 false)
+ call void @llvm.amdgcn.exp.i32(i32 20, i32 1, i32 %vdst.lo, i32 %vdst.hi, i32 %newaddr, i32 poison, i1 true, i1 false)
ret void
}
More information about the llvm-commits
mailing list