[llvm] [AMDGPU] Add intrinsic and MI for image_bvh_dual_intersect_ray (PR #130038)
Mariusz Sikora via llvm-commits
llvm-commits at lists.llvm.org
Sun Mar 16 13:49:40 PDT 2025
https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/130038
>From c72e04d53da1e77199af7cc02d905d61c6562683 Mon Sep 17 00:00:00 2001
From: Mateja Marjanovic <mateja.marjanovic at amd.com>
Date: Mon, 3 Mar 2025 04:13:00 -0500
Subject: [PATCH 1/2] [AMDGPU] Add intrinsic and MI for image_bvh_dual_intersect_ray
- Add the llvm.amdgcn.image.bvh.dual.intersect.ray intrinsic and the
  image_bvh_dual_intersect_ray machine instruction.
- Add the llvm_v10i32_ty and llvm_v10f32_ty type definitions to Intrinsics.td.
---
llvm/include/llvm/IR/Intrinsics.td | 2 +
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 11 +++
.../AMDGPU/AMDGPUInstructionSelector.cpp | 9 +-
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 57 ++++++++++++
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 2 +
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 35 ++++++--
.../Disassembler/AMDGPUDisassembler.cpp | 1 +
llvm/lib/Target/AMDGPU/MIMGInstructions.td | 46 ++++++----
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 49 ++++++++++-
llvm/lib/Target/AMDGPU/SIInstructions.td | 10 ++-
.../AMDGPU/llvm.amdgcn.dual_intersect_ray.ll | 87 +++++++++++++++++++
llvm/test/MC/AMDGPU/gfx12_asm_vimage.s | 3 +
llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s | 3 +
.../Disassembler/AMDGPU/gfx12_dasm_vimage.txt | 3 +
14 files changed, 289 insertions(+), 29 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dual_intersect_ray.ll
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 62239ca705b9e..645591dc4c776 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -561,6 +561,7 @@ def llvm_v3i32_ty : LLVMType<v3i32>; // 3 x i32
def llvm_v4i32_ty : LLVMType<v4i32>; // 4 x i32
def llvm_v6i32_ty : LLVMType<v6i32>; // 6 x i32
def llvm_v8i32_ty : LLVMType<v8i32>; // 8 x i32
+def llvm_v10i32_ty : LLVMType<v10i32>; // 10 x i32
def llvm_v16i32_ty : LLVMType<v16i32>; // 16 x i32
def llvm_v32i32_ty : LLVMType<v32i32>; // 32 x i32
def llvm_v64i32_ty : LLVMType<v64i32>; // 64 x i32
@@ -591,6 +592,7 @@ def llvm_v2f32_ty : LLVMType<v2f32>; // 2 x float
def llvm_v3f32_ty : LLVMType<v3f32>; // 3 x float
def llvm_v4f32_ty : LLVMType<v4f32>; // 4 x float
def llvm_v8f32_ty : LLVMType<v8f32>; // 8 x float
+def llvm_v10f32_ty : LLVMType<v10f32>; // 10 x float
def llvm_v16f32_ty : LLVMType<v16f32>; // 16 x float
def llvm_v32f32_ty : LLVMType<v32f32>; // 32 x float
def llvm_v1f64_ty : LLVMType<v1f64>; // 1 x double
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 86e050333acc7..f93439b30523e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2801,6 +2801,17 @@ def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty
// GFX12 Intrinsics
//===----------------------------------------------------------------------===//
+// <vdata>, <ray_origin>, <ray_dir>
+// llvm.amdgcn.image.bvh.dual.intersect.ray <node_ptr>, <ray_extent>,
+// <instance_mask>, <ray_origin>,
+// <ray_dir>, <offsets>,
+// <texture_descr>
+def int_amdgcn_image_bvh_dual_intersect_ray :
+ Intrinsic<[llvm_v10i32_ty, llvm_v3f32_ty, llvm_v3f32_ty],
+ [llvm_i64_ty, llvm_float_ty, llvm_i8_ty, llvm_v3f32_ty,
+ llvm_v3f32_ty, llvm_v2i32_ty, llvm_v4i32_ty],
+ [IntrReadMem, IntrWillReturn]>;
+
// llvm.amdgcn.permlane16.var <old> <src0> <src1> <fi> <bound_control>
def int_amdgcn_permlane16_var : ClangBuiltin<"__builtin_amdgcn_permlane16_var">,
Intrinsic<[llvm_i32_ty],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 2ee82381c4ef0..0f9096d4cec5b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3580,10 +3580,12 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
MachineInstr &MI) const {
- MI.setDesc(TII.get(MI.getOperand(1).getImm()));
- MI.removeOperand(1);
+ unsigned OpcodeOpIdx =
+ MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
+ MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm()));
+ MI.removeOperand(OpcodeOpIdx);
MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
- return true;
+ return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
}
// FIXME: This should be removed and let the patterns select. We just need the
@@ -4097,6 +4099,7 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
assert(Intr && "not an image intrinsic with image pseudo");
return selectImageIntrinsic(I, Intr);
}
+ case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
return selectBVHIntersectRayIntrinsic(I);
case AMDGPU::G_SBFX:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index b3a8183beeacf..cd0554a5c5b99 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -7183,6 +7183,61 @@ bool AMDGPULegalizerInfo::legalizeBVHIntersectRayIntrinsic(
return true;
}
+bool AMDGPULegalizerInfo::legalizeBVHDualIntrinsic(MachineInstr &MI,
+ MachineIRBuilder &B) const {
+ const LLT S32 = LLT::scalar(32);
+ const LLT V2S32 = LLT::fixed_vector(2, 32);
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register DstOrigin = MI.getOperand(1).getReg();
+ Register DstDir = MI.getOperand(2).getReg();
+ Register NodePtr = MI.getOperand(4).getReg();
+ Register RayExtent = MI.getOperand(5).getReg();
+ Register InstanceMask = MI.getOperand(6).getReg();
+ Register RayOrigin = MI.getOperand(7).getReg();
+ Register RayDir = MI.getOperand(8).getReg();
+ Register Offsets = MI.getOperand(9).getReg();
+ Register TDescr = MI.getOperand(10).getReg();
+
+ if (!AMDGPU::isGFX12Plus(ST)) {
+ DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
+ "intrinsic not supported on subtarget",
+ MI.getDebugLoc());
+ B.getMF().getFunction().getContext().diagnose(BadIntrin);
+ return false;
+ }
+
+ const unsigned NumVDataDwords = 10;
+ const unsigned NumVAddrDwords = 12;
+ int Opcode = AMDGPU::getMIMGOpcode(AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
+ AMDGPU::MIMGEncGfx12, NumVDataDwords,
+ NumVAddrDwords);
+ assert(Opcode != -1);
+
+ SmallVector<Register, 12> Ops;
+ Ops.push_back(NodePtr);
+ Ops.push_back(B.buildMergeLikeInstr(
+ V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)})
+ .getReg(0));
+ Ops.push_back(RayOrigin);
+ Ops.push_back(RayDir);
+ Ops.push_back(Offsets);
+
+ auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
+ .addDef(DstReg)
+ .addDef(DstOrigin)
+ .addDef(DstDir)
+ .addImm(Opcode);
+
+ for (Register R : Ops)
+ MIB.addUse(R);
+
+ MIB.addUse(TDescr).cloneMemRefs(MI);
+
+ MI.eraseFromParent();
+ return true;
+}
+
bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
MachineIRBuilder &B) const {
const SITargetLowering *TLI = ST.getTargetLowering();
@@ -7531,6 +7586,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return legalizeRsqClampIntrinsic(MI, MRI, B);
case Intrinsic::amdgcn_image_bvh_intersect_ray:
return legalizeBVHIntersectRayIntrinsic(MI, B);
+ case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
+ return legalizeBVHDualIntrinsic(MI, B);
case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index a98e8ba7aaaf1..aba1f55330913 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -208,6 +208,8 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI,
MachineIRBuilder &B) const;
+ bool legalizeBVHDualIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const;
+
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI,
Intrinsic::ID IID) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index c19ee14ab1574..ca74e45338744 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3217,10 +3217,13 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg);
return;
}
- case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY: {
- unsigned N = MI.getNumExplicitOperands() - 2;
+ case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
+ case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: {
+ bool IsDual = MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY;
+ unsigned NumMods = IsDual ? 0 : 1; // Has A16 modifier
+ unsigned LastRegOpIdx = MI.getNumExplicitOperands() - 1 - NumMods;
applyDefaultMapping(OpdMapper);
- executeInWaterfallLoop(B, MI, {N});
+ executeInWaterfallLoop(B, MI, {LastRegOpIdx});
return;
}
case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
@@ -5010,11 +5013,24 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
assert(RSrcIntrin->IsImage);
return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
}
- case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY: {
- unsigned N = MI.getNumExplicitOperands() - 2;
- OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
- OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
- if (N == 3) {
+ case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
+ case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: {
+ bool IsDual = MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY;
+ unsigned NumMods = IsDual ? 0 : 1; // Has A16 modifier
+ unsigned LastRegOpIdx = MI.getNumExplicitOperands() - 1 - NumMods;
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
+ if (IsDual) {
+ OpdsMapping[1] = AMDGPU::getValueMapping(
+ AMDGPU::VGPRRegBankID,
+ MRI.getType(MI.getOperand(1).getReg()).getSizeInBits());
+ OpdsMapping[2] = AMDGPU::getValueMapping(
+ AMDGPU::VGPRRegBankID,
+ MRI.getType(MI.getOperand(2).getReg()).getSizeInBits());
+ }
+ OpdsMapping[LastRegOpIdx] =
+ getSGPROpMapping(MI.getOperand(LastRegOpIdx).getReg(), MRI, *TRI);
+ if (LastRegOpIdx == 3) {
// Sequential form: all operands combined into VGPR256/VGPR512
unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
if (Size > 256)
@@ -5022,7 +5038,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
} else {
// NSA form
- for (unsigned I = 2; I < N; ++I) {
+ unsigned FirstSrcOpIdx = IsDual ? 4 : 2;
+ for (unsigned I = FirstSrcOpIdx; I < LastRegOpIdx; ++I) {
unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits();
OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
}
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 728ce125eba2d..847121f251361 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -268,6 +268,7 @@ DECODE_OPERAND_REG_8(VReg_128)
DECODE_OPERAND_REG_8(VReg_192)
DECODE_OPERAND_REG_8(VReg_256)
DECODE_OPERAND_REG_8(VReg_288)
+DECODE_OPERAND_REG_8(VReg_320)
DECODE_OPERAND_REG_8(VReg_352)
DECODE_OPERAND_REG_8(VReg_384)
DECODE_OPERAND_REG_8(VReg_512)
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 1b94d6c43392d..63af4b2e351fb 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -1509,7 +1509,7 @@ multiclass MIMG_Gather <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0,
multiclass MIMG_Gather_WQM <mimgopc op, AMDGPUSampleVariant sample>
: MIMG_Gather<op, sample, 1>;
-class MIMG_IntersectRay_Helper<bit Is64, bit IsA16> {
+class MIMG_IntersectRay_Helper<bit Is64, bit IsA16, bit isDual> {
int num_addrs = !if(Is64, !if(IsA16, 9, 12), !if(IsA16, 8, 11));
RegisterClass RegClass = MIMGAddrSize<num_addrs, 0>.RegClass;
int VAddrDwords = !srl(RegClass.Size, 5);
@@ -1517,9 +1517,10 @@ class MIMG_IntersectRay_Helper<bit Is64, bit IsA16> {
int GFX11PlusNSAAddrs = !if(IsA16, 4, 5);
RegisterClass node_ptr_type = !if(Is64, VReg_64, VGPR_32);
list<RegisterClass> GFX11PlusAddrTypes =
- !if(IsA16,
+ !if(isDual, [VReg_64, VReg_64, VReg_96, VReg_96, VReg_64],
+ !if(IsA16,
[node_ptr_type, VGPR_32, VReg_96, VReg_96],
- [node_ptr_type, VGPR_32, VReg_96, VReg_96, VReg_96]);
+ [node_ptr_type, VGPR_32, VReg_96, VReg_96, VReg_96]));
}
class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterClass AddrRC>
@@ -1553,15 +1554,26 @@ class MIMG_IntersectRay_nsa_gfx11<mimgopc op, string opcode, int num_addrs,
}
class VIMAGE_IntersectRay_gfx12<mimgopc op, string opcode, int num_addrs,
+ bit isDual,
list<RegisterClass> addr_types>
- : VIMAGE_gfx12<op.GFX12, (outs VReg_128:$vdata),
+ : VIMAGE_gfx12<op.GFX12, !if(isDual,
+ (outs VReg_320:$vdata, VReg_96:$ray_origin_out,
+ VReg_96:$ray_dir_out),
+ (outs VReg_128:$vdata)),
num_addrs, "GFX12", addr_types> {
- let InOperandList = !con(nsah.AddrIns, (ins SReg_128_XNULL:$rsrc, A16:$a16));
- let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $rsrc$a16";
-}
-
-multiclass MIMG_IntersectRay<mimgopc op, string opcode, bit Is64, bit IsA16> {
- defvar info = MIMG_IntersectRay_Helper<Is64, IsA16>;
+ let Constraints = !if(isDual,
+ "$ray_origin_out = $vaddr2, $ray_dir_out = $vaddr3", "");
+ let InOperandList = !con(nsah.AddrIns, (ins SReg_128_XNULL:$rsrc),
+ !if(isDual, (ins), (ins A16:$a16)));
+ let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $rsrc"#
+ !if(isDual, "", "$a16");
+ let SchedRW = !if(isDual,
+ [WriteVMEM, WriteVMEM, WriteVMEM], [WriteVMEM]);
+}
+
+multiclass MIMG_IntersectRay<mimgopc op, string opcode, bit Is64, bit IsA16,
+ bit isDual> {
+ defvar info = MIMG_IntersectRay_Helper<Is64, IsA16, isDual>;
def "" : MIMGBaseOpcode {
let BVH = 1;
let A16 = IsA16;
@@ -1599,7 +1611,8 @@ multiclass MIMG_IntersectRay<mimgopc op, string opcode, bit Is64, bit IsA16> {
}
}
def _gfx12 : VIMAGE_IntersectRay_gfx12<op, opcode, info.GFX11PlusNSAAddrs,
- info.GFX11PlusAddrTypes> {
+ isDual, info.GFX11PlusAddrTypes> {
+ let VDataDwords = !if(isDual, 10, 4);
let VAddrDwords = info.num_addrs;
}
}
@@ -1771,15 +1784,18 @@ defm IMAGE_MSAA_LOAD_X : MIMG_NoSampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x80>, "ima
let OtherPredicates = [HasImageInsts, HasGFX10_AEncoding] in {
defm IMAGE_MSAA_LOAD : MIMG_MSAA_Load <mimgopc<0x18, 0x18, MIMG.NOP>, "image_msaa_load">;
-defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0x19, 0x19, 0xe6>, "image_bvh_intersect_ray", 0, 0>;
-defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0x19, 0x19, 0xe6>, "image_bvh_intersect_ray", 0, 1>;
-defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0x1a, 0x1a, 0xe7>, "image_bvh64_intersect_ray", 1, 0>;
-defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0x1a, 0x1a, 0xe7>, "image_bvh64_intersect_ray", 1, 1>;
+defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0x19, 0x19, 0xe6>, "image_bvh_intersect_ray", 0, 0, 0>;
+defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0x19, 0x19, 0xe6>, "image_bvh_intersect_ray", 0, 1, 0>;
+defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0x1a, 0x1a, 0xe7>, "image_bvh64_intersect_ray", 1, 0, 0>;
+defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0x1a, 0x1a, 0xe7>, "image_bvh64_intersect_ray", 1, 1, 0>;
} // End OtherPredicates = [HasImageInsts, HasGFX10_AEncoding]
+defm IMAGE_BVH_DUAL_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0x80, MIMG.NOP, MIMG.NOP>, "image_bvh_dual_intersect_ray", 1, 0, 1>;
+
let SubtargetPredicate = isGFX12Plus in {
def : AMDGPUMnemonicAlias<"bvh_intersect_ray", "image_bvh_intersect_ray">;
def : AMDGPUMnemonicAlias<"bvh64_intersect_ray", "image_bvh64_intersect_ray">;
+ def : AMDGPUMnemonicAlias<"bvh_dual_intersect_ray", "image_bvh_dual_intersect_ray">;
}
} // End let OtherPredicates = [HasImageInsts]
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9743320601ed4..c5166c459d7ac 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1386,9 +1386,14 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
MachineMemOperand::MOVolatile;
return true;
}
+ case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
case Intrinsic::amdgcn_image_bvh_intersect_ray: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
+ Info.memVT =
+ MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
+ ? CI.getType()
+ : cast<StructType>(CI.getType())
+ ->getElementType(0)); // XXX: what is correct VT?
Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
Info.align.reset();
@@ -9435,6 +9440,48 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op->getVTList(), Ops, VT,
M->getMemOperand());
}
+ case Intrinsic::amdgcn_image_bvh_dual_intersect_ray: {
+ MemSDNode *M = cast<MemSDNode>(Op);
+ SDValue NodePtr = M->getOperand(2);
+ SDValue RayExtent = M->getOperand(3);
+ SDValue InstanceMask = M->getOperand(4);
+ SDValue RayOrigin = M->getOperand(5);
+ SDValue RayDir = M->getOperand(6);
+ SDValue Offsets = M->getOperand(7);
+ SDValue TDescr = M->getOperand(8);
+
+ assert(NodePtr.getValueType() == MVT::i64);
+ assert(RayDir.getValueType() == MVT::v3f32);
+
+ if (!AMDGPU::isGFX12Plus(*Subtarget)) {
+ emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
+ return SDValue();
+ }
+
+ const unsigned NumVDataDwords = 10;
+ const unsigned NumVAddrDwords = 12;
+ int Opcode = AMDGPU::getMIMGOpcode(AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
+ AMDGPU::MIMGEncGfx12, NumVDataDwords,
+ NumVAddrDwords);
+ assert(Opcode != -1);
+
+ SmallVector<SDValue, 16> Ops;
+ Ops.push_back(NodePtr);
+ Ops.push_back(DAG.getBuildVector(
+ MVT::v2i32, DL,
+ {DAG.getBitcast(MVT::i32, RayExtent),
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
+ Ops.push_back(RayOrigin);
+ Ops.push_back(RayDir);
+ Ops.push_back(Offsets);
+ Ops.push_back(TDescr);
+ Ops.push_back(M->getChain());
+
+ auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
+ MachineMemOperand *MemRef = M->getMemOperand();
+ DAG.setNodeMemRefs(NewNode, {MemRef});
+ return SDValue(NewNode, 0);
+ }
case Intrinsic::amdgcn_image_bvh_intersect_ray: {
MemSDNode *M = cast<MemSDNode>(Op);
SDValue NodePtr = M->getOperand(2);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index de77401eb0137..b2315bc80f0a4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -4362,7 +4362,15 @@ def G_AMDGPU_INTRIN_IMAGE_STORE_D16 : AMDGPUGenericInstruction {
def G_AMDGPU_BVH_INTERSECT_RAY : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
- let InOperandList = (ins unknown:$intrin, variable_ops);
+ let InOperandList = (ins unknown:$opcode, variable_ops);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+ let mayStore = 0;
+}
+
+def G_AMDGPU_BVH_DUAL_INTERSECT_RAY : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst, type1:$ray_origin, type1:$ray_dir);
+ let InOperandList = (ins unknown:$opcode, variable_ops);
let hasSideEffects = 0;
let mayLoad = 1;
let mayStore = 0;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dual_intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dual_intersect_ray.ll
new file mode 100644
index 0000000000000..710d802016aeb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dual_intersect_ray.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-SDAG %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISEL %s
+
+declare {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh.dual.intersect.ray(i64, float, i8, <3 x float>, <3 x float>, <2 x i32>, <4 x i32>)
+
+define amdgpu_ps <10 x float> @image_bvh_dual_intersect_ray(i64 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, <2 x i32> %offsets, <4 x i32> inreg %tdescr, ptr addrspace(1) %origin, ptr addrspace(1) %dir) {
+; GFX12-SDAG-LABEL: image_bvh_dual_intersect_ray:
+; GFX12-SDAG: ; %bb.0: ; %main_body
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX12-SDAG-NEXT: image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[2:3], v[17:19], v[20:22], v[9:10]], s[0:3]
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: global_store_b96 v[11:12], v[17:19], off
+; GFX12-SDAG-NEXT: global_store_b96 v[13:14], v[20:22], off
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: image_bvh_dual_intersect_ray:
+; GFX12-GISEL: ; %bb.0: ; %main_body
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v15, v3 :: v_dual_mov_b32 v16, v4
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v17, v5 :: v_dual_mov_b32 v18, v6
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v19, v7 :: v_dual_mov_b32 v20, v8
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX12-GISEL-NEXT: image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[2:3], v[15:17], v[18:20], v[9:10]], s[0:3]
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: global_store_b96 v[11:12], v[15:17], off
+; GFX12-GISEL-NEXT: global_store_b96 v[13:14], v[18:20], off
+; GFX12-GISEL-NEXT: ; return to shader part epilog
+main_body:
+ %ray_origin0 = insertelement <3 x float> poison, float %ray_origin_x, i32 0
+ %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
+ %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
+ %ray_dir0 = insertelement <3 x float> poison, float %ray_dir_x, i32 0
+ %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
+ %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
+ %v = call {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh.dual.intersect.ray(i64 %node_ptr, float %ray_extent, i8 0, <3 x float> %ray_origin, <3 x float> %ray_dir, <2 x i32> %offsets, <4 x i32> %tdescr)
+ %a = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 0
+ %r = bitcast <10 x i32> %a to <10 x float>
+ %o = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 1
+ store <3 x float> %o, ptr addrspace(1) %origin
+ %d = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 2
+ store <3 x float> %d, ptr addrspace(1) %dir
+ ret <10 x float> %r
+}
+
+define amdgpu_ps <10 x float> @image_bvh_dual_intersect_ray_1(i64 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, <2 x i32> %offsets, <4 x i32> inreg %tdescr, ptr addrspace(1) %origin, ptr addrspace(1) %dir) {
+; GFX12-SDAG-LABEL: image_bvh_dual_intersect_ray_1:
+; GFX12-SDAG: ; %bb.0: ; %main_body
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX12-SDAG-NEXT: image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[2:3], v[17:19], v[20:22], v[9:10]], s[0:3]
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: global_store_b96 v[11:12], v[17:19], off
+; GFX12-SDAG-NEXT: global_store_b96 v[13:14], v[20:22], off
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: image_bvh_dual_intersect_ray_1:
+; GFX12-GISEL: ; %bb.0: ; %main_body
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v15, v3 :: v_dual_mov_b32 v16, v4
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v17, v5 :: v_dual_mov_b32 v18, v6
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v19, v7 :: v_dual_mov_b32 v20, v8
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 1
+; GFX12-GISEL-NEXT: image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[2:3], v[15:17], v[18:20], v[9:10]], s[0:3]
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: global_store_b96 v[11:12], v[15:17], off
+; GFX12-GISEL-NEXT: global_store_b96 v[13:14], v[18:20], off
+; GFX12-GISEL-NEXT: ; return to shader part epilog
+main_body:
+ %ray_origin0 = insertelement <3 x float> poison, float %ray_origin_x, i32 0
+ %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
+ %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
+ %ray_dir0 = insertelement <3 x float> poison, float %ray_dir_x, i32 0
+ %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
+ %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
+ %v = call {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh.dual.intersect.ray(i64 %node_ptr, float %ray_extent, i8 1, <3 x float> %ray_origin, <3 x float> %ray_dir, <2 x i32> %offsets, <4 x i32> %tdescr)
+ %a = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 0
+ %r = bitcast <10 x i32> %a to <10 x float>
+ %o = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 1
+ store <3 x float> %o, ptr addrspace(1) %origin
+ %d = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 2
+ store <3 x float> %d, ptr addrspace(1) %dir
+ ret <10 x float> %r
+}
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vimage.s b/llvm/test/MC/AMDGPU/gfx12_asm_vimage.s
index 8bf9b92e8d1d8..3ca8f4308a0ee 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vimage.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vimage.s
@@ -1066,6 +1066,9 @@ image_bvh64_intersect_ray v[4:7], [v[9:10], v11, v[12:14], v[15:17], v[18:20]],
image_bvh64_intersect_ray v[4:7], [v[9:10], v11, v[12:14], v[15:17]], s[4:7] a16
// GFX12: encoding: [0x50,0x80,0xc6,0xd3,0x04,0x08,0x00,0x00,0x09,0x0b,0x0c,0x0f]
+image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[11:12], v[3:5], v[6:8], v[9:10]], s[0:3]
+// GFX12: encoding: [0x10,0x00,0xe0,0xd3,0x00,0x00,0x00,0x09,0x00,0x0b,0x03,0x06]
+
image_get_resinfo v4, v32, s[96:103] dmask:0x1 dim:SQ_RSRC_IMG_1D
// GFX12: encoding: [0x00,0xc0,0x45,0xd0,0x04,0xc0,0x00,0x00,0x20,0x00,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s
index b9999b671f7e7..0148ff6cabc93 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vimage_alias.s
@@ -41,3 +41,6 @@ bvh_intersect_ray v[4:7], [v9, v10, v[11:13], v[14:16], v[17:19]], s[4:7]
bvh64_intersect_ray v[4:7], [v[9:10], v11, v[12:14], v[15:17], v[18:20]], s[4:7]
// GFX12: image_bvh64_intersect_ray v[4:7], [v[9:10], v11, v[12:14], v[15:17], v[18:20]], s[4:7] ; encoding: [0x10,0x80,0xc6,0xd3,0x04,0x08,0x00,0x12,0x09,0x0b,0x0c,0x0f]
+
+bvh_dual_intersect_ray v[0:9], [v[0:1], v[11:12], v[3:5], v[6:8], v[9:10]], s[0:3]
+// GFX12: image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[11:12], v[3:5], v[6:8], v[9:10]], s[0:3] ; encoding: [0x10,0x00,0xe0,0xd3,0x00,0x00,0x00,0x09,0x00,0x0b,0x03,0x06]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vimage.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vimage.txt
index 233c2e1b9d083..afb7c3c24db17 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vimage.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vimage.txt
@@ -1066,6 +1066,9 @@
# GFX12: image_bvh64_intersect_ray v[4:7], [v[9:10], v11, v[12:14], v[15:17]], s[4:7] a16 ; encoding: [0x50,0x80,0xc6,0xd3,0x04,0x08,0x00,0x00,0x09,0x0b,0x0c,0x0f]
0x50,0x80,0xc6,0xd3,0x04,0x08,0x00,0x00,0x09,0x0b,0x0c,0x0f
+# GFX12: image_bvh_dual_intersect_ray v[0:9], [v[0:1], v[11:12], v[3:5], v[6:8], v[9:10]], s[0:3] ; encoding: [0x10,0x00,0xe0,0xd3,0x00,0x00,0x00,0x09,0x00,0x0b,0x03,0x06]
+0x10,0x00,0xe0,0xd3,0x00,0x00,0x00,0x09,0x00,0x0b,0x03,0x06
+
# GFX12: image_get_resinfo v4, v32, s[96:103] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0xc0,0x45,0xd0,0x04,0xc0,0x00,0x00,0x20,0x00,0x00,0x00]
0x00,0xc0,0x45,0xd0,0x04,0xc0,0x00,0x00,0x20,0x00,0x00,0x00
>From 6fafb9873a01c9294ed1b73e7e4bfa470c7939a9 Mon Sep 17 00:00:00 2001
From: Mariusz Sikora <mariusz.sikora at amd.com>
Date: Fri, 14 Mar 2025 12:58:20 -0400
Subject: [PATCH 2/2] Changes after review
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 2 +-
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 34 ++++++++-----------
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +-
3 files changed, 17 insertions(+), 21 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index f93439b30523e..a1cfcfa8e67d6 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2810,7 +2810,7 @@ def int_amdgcn_image_bvh_dual_intersect_ray :
Intrinsic<[llvm_v10i32_ty, llvm_v3f32_ty, llvm_v3f32_ty],
[llvm_i64_ty, llvm_float_ty, llvm_i8_ty, llvm_v3f32_ty,
llvm_v3f32_ty, llvm_v2i32_ty, llvm_v4i32_ty],
- [IntrReadMem, IntrWillReturn]>;
+ [IntrReadMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
// llvm.amdgcn.permlane16.var <old> <src0> <src1> <fi> <bound_control>
def int_amdgcn_permlane16_var : ClangBuiltin<"__builtin_amdgcn_permlane16_var">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index cd0554a5c5b99..03d75d006c982 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -7214,25 +7214,21 @@ bool AMDGPULegalizerInfo::legalizeBVHDualIntrinsic(MachineInstr &MI,
NumVAddrDwords);
assert(Opcode != -1);
- SmallVector<Register, 12> Ops;
- Ops.push_back(NodePtr);
- Ops.push_back(B.buildMergeLikeInstr(
- V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)})
- .getReg(0));
- Ops.push_back(RayOrigin);
- Ops.push_back(RayDir);
- Ops.push_back(Offsets);
-
- auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
- .addDef(DstReg)
- .addDef(DstOrigin)
- .addDef(DstDir)
- .addImm(Opcode);
-
- for (Register R : Ops)
- MIB.addUse(R);
-
- MIB.addUse(TDescr).cloneMemRefs(MI);
+ auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
+ V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)});
+
+ B.buildInstr(AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
+ .addDef(DstReg)
+ .addDef(DstOrigin)
+ .addDef(DstDir)
+ .addImm(Opcode)
+ .addUse(NodePtr)
+ .addUse(RayExtentInstanceMaskVec.getReg(0))
+ .addUse(RayOrigin)
+ .addUse(RayDir)
+ .addUse(Offsets)
+ .addUse(TDescr)
+ .cloneMemRefs(MI);
MI.eraseFromParent();
return true;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c5166c459d7ac..5a9887101bfa2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9465,7 +9465,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
NumVAddrDwords);
assert(Opcode != -1);
- SmallVector<SDValue, 16> Ops;
+ SmallVector<SDValue, 7> Ops;
Ops.push_back(NodePtr);
Ops.push_back(DAG.getBuildVector(
MVT::v2i32, DL,
More information about the llvm-commits mailing list