[llvm] 3822a01 - [AMDGPU] Add GFX11 ds_bvh_stack_rtn_b32 instruction
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 15 08:49:17 PDT 2022
Author: Jay Foad
Date: 2022-09-15T16:46:14+01:00
New Revision: 3822a01e0be2c8b79d1102a46a22454f96550dce
URL: https://github.com/llvm/llvm-project/commit/3822a01e0be2c8b79d1102a46a22454f96550dce
DIFF: https://github.com/llvm/llvm-project/commit/3822a01e0be2c8b79d1102a46a22454f96550dce.diff
LOG: [AMDGPU] Add GFX11 ds_bvh_stack_rtn_b32 instruction
Differential Revision: https://reviews.llvm.org/D133928
Added:
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll
Modified:
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
llvm/lib/Target/AMDGPU/DSInstructions.td
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/MC/AMDGPU/gfx11_asm_ds.s
llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_ds.txt
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 189780d6407b..35791362d066 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2004,6 +2004,18 @@ def int_amdgcn_ds_sub_gs_reg_rtn :
Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
[ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn]>;
+def int_amdgcn_ds_bvh_stack_rtn :
+ Intrinsic<
+ [llvm_i32_ty, llvm_i32_ty], // %vdst, %addr
+ [
+ llvm_i32_ty, // %addr
+ llvm_i32_ty, // %data0
+ llvm_v4i32_ty, // %data1
+ llvm_i32_ty, // %offset
+ ],
+ [ImmArg<ArgIndex<3>>, IntrWillReturn]
+ >;
+
// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
//
// These operations perform a matrix multiplication and accumulation of
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 2e1d6b27b3ae..303cbe5657bc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -43,7 +43,6 @@ using namespace llvm;
//===----------------------------------------------------------------------===//
namespace {
-
static SDValue stripBitcast(SDValue Val) {
return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
}
@@ -96,7 +95,7 @@ static SDValue stripExtractLoElt(SDValue In) {
return In;
}
-} // end anonymous namespace
+} // end anonymous namespace
INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
"AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
@@ -2380,6 +2379,19 @@ void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
+// We need to handle this here because tablegen doesn't support matching
+// instructions with multiple outputs.
+void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
+ unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
+ SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
+ N->getOperand(5), N->getOperand(0)};
+
+ MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
+ MachineMemOperand *MMO = M->getMemOperand();
+ SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
+ CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
+}
+
static unsigned gwsIntrinToOpcode(unsigned IntrID) {
switch (IntrID) {
case Intrinsic::amdgcn_ds_gws_init:
@@ -2532,6 +2544,9 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
SelectDSAppendConsume(N, IntrID);
return;
}
+ case Intrinsic::amdgcn_ds_bvh_stack_rtn:
+ SelectDSBvhStackIntrinsic(N);
+ return;
}
SelectCode(N);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index b5e39f6ed777..1a8e8f622cdd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -268,6 +268,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
void SelectBRCOND(SDNode *N);
void SelectFMAD_FMA(SDNode *N);
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
+ void SelectDSBvhStackIntrinsic(SDNode *N);
void SelectDS_GWS(SDNode *N, unsigned IntrID);
void SelectInterpP1F16(SDNode *N);
void SelectINTRINSIC_W_CHAIN(SDNode *N);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 1fa759620017..f1068647f84b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1803,6 +1803,33 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
return true;
}
+// We need to handle this here because tablegen doesn't support matching
+// instructions with multiple outputs.
+bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
+ MachineInstr &MI) const {
+ Register Dst0 = MI.getOperand(0).getReg();
+ Register Dst1 = MI.getOperand(1).getReg();
+
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock *MBB = MI.getParent();
+
+ Register Addr = MI.getOperand(3).getReg();
+ Register Data0 = MI.getOperand(4).getReg();
+ Register Data1 = MI.getOperand(5).getReg();
+ unsigned Offset = MI.getOperand(6).getImm();
+
+ auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
+ .addDef(Dst1)
+ .addUse(Addr)
+ .addUse(Data0)
+ .addUse(Data1)
+ .addImm(Offset)
+ .cloneMemRefs(MI);
+
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
MachineInstr &I) const {
unsigned IntrinsicID = I.getIntrinsicID();
@@ -1841,6 +1868,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
return false;
}
break;
+ case Intrinsic::amdgcn_ds_bvh_stack_rtn:
+ return selectDSBvhStackIntrinsic(I);
}
return selectImpl(I, *CoverageInfo);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index d8caaed2810f..2175d9a1f140 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -120,6 +120,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
bool selectSBarrier(MachineInstr &MI) const;
+ bool selectDSBvhStackIntrinsic(MachineInstr &MI) const;
bool selectImageIntrinsic(MachineInstr &MI,
const AMDGPU::ImageDimIntrinsicInfo *Intr) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index ca78c4a64d44..abd0fbf1af6a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4745,6 +4745,20 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
break;
+ case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
+ OpdsMapping[0] =
+ getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
+ OpdsMapping[1] =
+ getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); // %addr
+ OpdsMapping[3] =
+ getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); // %addr
+ OpdsMapping[4] =
+ getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); // %data0
+ OpdsMapping[5] =
+ getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
+ break;
+ }
+
default:
return getInvalidInstructionMapping();
}
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index cfdbaab52b8d..6ae1dcbb77ae 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -6382,11 +6382,20 @@ void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst,
void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands,
bool IsGdsHardcoded) {
OptionalImmIndexMap OptionalIdx;
+ const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
AMDGPUOperand::ImmTy OffsetType = AMDGPUOperand::ImmTyOffset;
for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+ auto TiedTo =
+ Desc.getOperandConstraint(Inst.getNumOperands(), MCOI::TIED_TO);
+
+ if (TiedTo != -1) {
+ assert((unsigned)TiedTo < Inst.getNumOperands());
+ Inst.addOperand(Inst.getOperand(TiedTo));
+ }
+
// Add the register arguments
if (Op.isReg()) {
Op.addRegOperands(Inst, 1);
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index d8387bf6f1ae..292e85f9b11a 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -277,6 +277,19 @@ multiclass DS_1A2D_Off8_RET_mc<string opName,
}
}
+class DS_BVH_STACK<string opName>
+: DS_Pseudo<opName,
+ (outs getLdStRegisterOperand<VGPR_32>.ret:$vdst, VGPR_32:$addr),
+ (ins VGPR_32:$addr_in, getLdStRegisterOperand<VGPR_32>.ret:$data0, VReg_128:$data1, offset:$offset),
+ " $vdst, $addr, $data0, $data1$offset"> {
+ let Constraints = "$addr = $addr_in";
+ let DisableEncoding = "$addr_in";
+ let has_gds = 0;
+ let gdsValue = 0;
+ // TODO: Use MMOs in the LDS address space instead of hasSideEffects = 1.
+ let hasSideEffects = 1;
+ let SchedRW = [WriteLDS, WriteLDS];
+}
class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = offset,
RegisterOperand data_op = getLdStRegisterOperand<rc>.ret>
@@ -713,6 +726,7 @@ let SubtargetPredicate = isGFX11Plus in {
def DS_ADD_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_add_gs_reg_rtn", VReg_64, VGPR_32>;
def DS_SUB_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_sub_gs_reg_rtn", VReg_64, VGPR_32>;
+def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32">;
} // let SubtargetPredicate = isGFX11Plus
@@ -1250,6 +1264,7 @@ defm DS_CMPSTORE_RTN_F64 : DS_Real_gfx11<0x071>;
defm DS_ADD_RTN_F32 : DS_Real_gfx11<0x079>;
defm DS_ADD_GS_REG_RTN : DS_Real_gfx11<0x07a>;
defm DS_SUB_GS_REG_RTN : DS_Real_gfx11<0x07b>;
+defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx11<0x0ad>;
//===----------------------------------------------------------------------===//
// GFX10.
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 99b3c2b17d0f..3979ff1a14bc 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1161,6 +1161,23 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
MachineMemOperand::MOVolatile;
return true;
}
+ case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+
+ const GCNTargetMachine &TM =
+ static_cast<const GCNTargetMachine &>(getTargetMachine());
+
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ Info.ptrVal = MFI->getGWSPSV(TM);
+
+ // This is an abstract access, but we need to specify a type and size.
+ Info.memVT = MVT::i32;
+ Info.size = 4;
+ Info.align = Align(4);
+
+ Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ return true;
+ }
default:
return false;
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll
new file mode 100644
index 000000000000..085eb9dee016
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s
+
+declare { i32, i32 } @llvm.amdgcn.ds.bvh.stack.rtn(i32, i32, <4 x i32>, i32 immarg)
+
+define amdgpu_gs void @test_ds_bvh_stack(i32 %addr, i32 %data0, <4 x i32> %data1, i32 addrspace(1)* %out) {
+; CHECK-LABEL: test_ds_bvh_stack:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: ds_bvh_stack_rtn_b32 v1, v0, v1, v[2:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v1, v0
+; CHECK-NEXT: global_store_b32 v[6:7], v0, off
+; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; CHECK-NEXT: s_endpgm
+ %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, i32 0)
+ %vdst = extractvalue { i32, i32 } %pair, 0
+ %newaddr = extractvalue { i32, i32 } %pair, 1
+ %res = add i32 %vdst, %newaddr
+ store i32 %res, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_gs void @test_ds_bvh_stack_1(i32 %addr, i32 %data0, <4 x i32> %data1, i32 addrspace(1)* %out) {
+; CHECK-LABEL: test_ds_bvh_stack_1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: ds_bvh_stack_rtn_b32 v1, v0, v1, v[2:5] offset:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v1, v0
+; CHECK-NEXT: global_store_b32 v[6:7], v0, off
+; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; CHECK-NEXT: s_endpgm
+ %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, i32 1)
+ %vdst = extractvalue { i32, i32 } %pair, 0
+ %newaddr = extractvalue { i32, i32 } %pair, 1
+ %res = add i32 %vdst, %newaddr
+ store i32 %res, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_ds.s b/llvm/test/MC/AMDGPU/gfx11_asm_ds.s
index 4cee84aa01c8..4e95aba5d651 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_ds.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_ds.s
@@ -1961,3 +1961,9 @@ ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,8,7)
ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM, "01pip")
// GFX11: [0x07,0x09,0xd4,0xd8,0x02,0x00,0x00,0x08]
+
+ds_bvh_stack_rtn_b32 v255, v254, v253, v[249:252]
+// GFX11: [0x00,0x00,0xb4,0xda,0xfe,0xfd,0xf9,0xff]
+
+ds_bvh_stack_rtn_b32 v255, v254, v253, v[249:252] offset:127
+// GFX11: [0x7f,0x00,0xb4,0xda,0xfe,0xfd,0xf9,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_ds.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_ds.txt
index 2debd902ce20..0d2129a85248 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_ds.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_ds.txt
@@ -5166,3 +5166,15 @@
# GFX11: ds_sub_gs_reg_rtn v[1:2], v255 gds ; encoding: [0x00,0x00,0xee,0xd9,0x00,0xff,0x00,0x01]
0x00,0x00,0xee,0xd9,0x00,0xff,0x00,0x01
+
+# GFX11: ds_bvh_stack_rtn_b32 v1, v2, v3, v[4:7] offset:127 ; encoding: [0x7f,0x00,0xb4,0xda,0x02,0x03,0x04,0x01]
+0x7f,0x00,0xb4,0xda,0x02,0x03,0x04,0x01
+
+# GFX11: ds_bvh_stack_rtn_b32 v1, v2, v3, v[4:7] ; encoding: [0x00,0x00,0xb4,0xda,0x02,0x03,0x04,0x01]
+0x00,0x00,0xb4,0xda,0x02,0x03,0x04,0x01
+
+# GFX11: ds_bvh_stack_rtn_b32 v254, v255, v253, v[5:8] offset:127 ; encoding: [0x7f,0x00,0xb4,0xda,0xff,0xfd,0x05,0xfe]
+0x7f,0x00,0xb4,0xda,0xff,0xfd,0x05,0xfe
+
+# GFX11: ds_bvh_stack_rtn_b32 v254, v255, v253, v[5:8] ; encoding: [0x00,0x00,0xb4,0xda,0xff,0xfd,0x05,0xfe]
+0x00,0x00,0xb4,0xda,0xff,0xfd,0x05,0xfe
More information about the llvm-commits mailing list