[llvm] 2a68364 - [AMDGPU] gfx11 waitcnt support for VINTERP and LDSDIR instructions
Joe Nash via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 17 07:00:04 PDT 2022
Author: Joe Nash
Date: 2022-06-17T09:30:37-04:00
New Revision: 2a683647455f0261c82d5c19790c101ed4e4cc1c
URL: https://github.com/llvm/llvm-project/commit/2a683647455f0261c82d5c19790c101ed4e4cc1c
DIFF: https://github.com/llvm/llvm-project/commit/2a683647455f0261c82d5c19790c101ed4e4cc1c.diff
LOG: [AMDGPU] gfx11 waitcnt support for VINTERP and LDSDIR instructions
Reviewed By: rampitec, #amdgpu
Differential Revision: https://reviews.llvm.org/D127781
Added:
Modified:
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 96efef2cc051..8e867b0fc68c 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -87,29 +87,29 @@ struct RegisterEncoding {
};
enum WaitEventType {
- VMEM_ACCESS, // vector-memory read & write
- VMEM_READ_ACCESS, // vector-memory read
- VMEM_WRITE_ACCESS,// vector-memory write
- LDS_ACCESS, // lds read & write
- GDS_ACCESS, // gds read & write
- SQ_MESSAGE, // send message
- SMEM_ACCESS, // scalar-memory read & write
- EXP_GPR_LOCK, // export holding on its data src
- GDS_GPR_LOCK, // GDS holding on its data and addr src
- EXP_POS_ACCESS, // write to export position
- EXP_PARAM_ACCESS, // write to export parameter
- VMW_GPR_LOCK, // vector-memory write holding on its data src
+ VMEM_ACCESS, // vector-memory read & write
+ VMEM_READ_ACCESS, // vector-memory read
+ VMEM_WRITE_ACCESS, // vector-memory write
+ LDS_ACCESS, // lds read & write
+ GDS_ACCESS, // gds read & write
+ SQ_MESSAGE, // send message
+ SMEM_ACCESS, // scalar-memory read & write
+ EXP_GPR_LOCK, // export holding on its data src
+ GDS_GPR_LOCK, // GDS holding on its data and addr src
+ EXP_POS_ACCESS, // write to export position
+ EXP_PARAM_ACCESS, // write to export parameter
+ VMW_GPR_LOCK, // vector-memory write holding on its data src
+ EXP_LDS_ACCESS, // read by ldsdir counting as export
NUM_WAIT_EVENTS,
};
static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
- (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
- (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
- (1 << SQ_MESSAGE),
- (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
- (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
- (1 << VMEM_WRITE_ACCESS)
-};
+ (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
+ (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
+ (1 << SQ_MESSAGE),
+ (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
+ (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | (1 << EXP_LDS_ACCESS),
+ (1 << VMEM_WRITE_ACCESS)};
// The mapping is:
// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
@@ -596,6 +596,12 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
CurrScore);
}
+ } else if (TII->isLDSDIR(Inst)) {
+ // LDSDIR instructions attach the score to the destination.
+ setExpScore(
+ &Inst, TII, TRI, MRI,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst),
+ CurrScore);
} else {
if (TII->isEXP(Inst)) {
// For export the destination registers are really temps that
@@ -1135,7 +1141,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
ScoreBrackets.clearVgprVmemTypes(RegNo);
}
- if (Op.isDef()) {
+ if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
ScoreBrackets.determineWait(
EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
}
@@ -1192,6 +1198,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
ScoreBrackets.applyWaitcnt(Wait);
}
+ // ExpCnt can be merged into VINTERP.
+ if (Wait.ExpCnt != ~0u && SIInstrInfo::isVINTERP(MI)) {
+ MachineOperand *WaitExp = TII->getNamedOperand(MI, AMDGPU::OpName::waitexp);
+ if (Wait.ExpCnt < WaitExp->getImm()) {
+ WaitExp->setImm(Wait.ExpCnt);
+ Modified = true;
+ }
+ Wait.ExpCnt = ~0u;
+
+ LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
+ << "Update Instr: " << MI);
+ }
+
// Build new waitcnt instructions unless no wait is needed or the old waitcnt
// instruction was modified to handle the required wait.
if (Wait.hasWaitExceptVsCnt()) {
@@ -1350,6 +1369,11 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
// May need to way wait for anything.
ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
}
+ } else if (SIInstrInfo::isLDSDIR(Inst)) {
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
+ } else if (TII->isVINTERP(Inst)) {
+ int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
+ ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
} else if (SIInstrInfo::isEXP(Inst)) {
unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
index b2e1542c1bc0..8c4602ed9504 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
@@ -9,8 +9,8 @@ define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m
; GCN-NEXT: lds_param_load v0, attr0.y
; GCN-NEXT: lds_param_load v1, attr1.x
; GCN-NEXT: v_mov_b32_e32 v4, s1
-; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:7
-; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:7
+; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
+; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1
; GCN-NEXT: v_interp_p2_f32 v0, v0, v4, v3 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v1, v1, v4, v0 wait_exp:7
; GCN-NEXT: exp mrt0 v3, v2, v0, v1 done
@@ -36,10 +36,10 @@ define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inr
; GCN-NEXT: lds_param_load v2, attr2.x
; GCN-NEXT: lds_param_load v3, attr3.x
; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:7
-; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:7
-; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:7
-; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:7
+; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
+; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
+; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
+; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3
; GCN-NEXT: v_interp_p2_f32 v0, v0, v5, v6 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v1, v1, v5, v7 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v2, v2, v5, v8 wait_exp:7
@@ -73,10 +73,10 @@ define amdgpu_ps void @v_interp_f32_many_vm(float addrspace(1)* %ptr, i32 inreg
; GCN-NEXT: lds_param_load v4, attr2.x
; GCN-NEXT: lds_param_load v5, attr3.x
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:7
-; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:7
-; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:7
-; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:7
+; GCN-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3
+; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
+; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
+; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5
; GCN-NEXT: v_interp_p2_f32 v2, v2, v1, v6 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v3, v3, v1, v7 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v4, v4, v1, v8 wait_exp:7
@@ -111,7 +111,7 @@ define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: lds_param_load v0, attr0.x
; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: v_interp_p10_f16_f32 v3, v0, v1, v0 wait_exp:7
+; GCN-NEXT: v_interp_p10_f16_f32 v3, v0, v1, v0
; GCN-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 op_sel:[1,0,1,0] wait_exp:7
; GCN-NEXT: v_interp_p2_f16_f32 v3, v0, v2, v3 wait_exp:7
; GCN-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v1 op_sel:[1,0,0,0] wait_exp:7
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
index 0e00a67fcf98..ef7bcf9828af 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
@@ -9,8 +9,8 @@ define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m
; GCN-NEXT: lds_param_load v0, attr0.y
; GCN-NEXT: lds_param_load v1, attr1.x
; GCN-NEXT: v_mov_b32_e32 v4, s1
-; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:7
-; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:7
+; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
+; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1
; GCN-NEXT: v_interp_p2_f32 v0, v0, v4, v3 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v1, v1, v4, v0 wait_exp:7
; GCN-NEXT: exp mrt0 v3, v2, v0, v1 done
@@ -36,10 +36,10 @@ define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inr
; GCN-NEXT: lds_param_load v2, attr2.x
; GCN-NEXT: lds_param_load v3, attr3.x
; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:7
-; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:7
-; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:7
-; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:7
+; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
+; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
+; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
+; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3
; GCN-NEXT: v_interp_p2_f32 v0, v0, v5, v6 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v1, v1, v5, v7 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v2, v2, v5, v8 wait_exp:7
@@ -73,10 +73,10 @@ define amdgpu_ps void @v_interp_f32_many_vm(float addrspace(1)* %ptr, i32 inreg
; GCN-NEXT: lds_param_load v4, attr2.x
; GCN-NEXT: lds_param_load v5, attr3.x
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:7
-; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:7
-; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:7
-; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:7
+; GCN-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3
+; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
+; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
+; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5
; GCN-NEXT: v_interp_p2_f32 v2, v2, v1, v6 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v3, v3, v1, v7 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v4, v4, v1, v8 wait_exp:7
@@ -111,7 +111,7 @@ define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: lds_param_load v0, attr0.x
; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: v_interp_p10_f16_f32 v3, v0, v1, v0 wait_exp:7
+; GCN-NEXT: v_interp_p10_f16_f32 v3, v0, v1, v0
; GCN-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 op_sel:[1,0,1,0] wait_exp:7
; GCN-NEXT: v_interp_p2_f16_f32 v3, v0, v2, v3 wait_exp:7
; GCN-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v1 op_sel:[1,0,0,0] wait_exp:7
More information about the llvm-commits mailing list