[llvm] 7ecf196 - [AMDGPU] Fix and extend vccz workarounds
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 18 07:28:35 PST 2020
Author: Jay Foad
Date: 2020-11-18T15:26:06Z
New Revision: 7ecf19697ee21a23fc8782daefcb268085874d7f
URL: https://github.com/llvm/llvm-project/commit/7ecf19697ee21a23fc8782daefcb268085874d7f
DIFF: https://github.com/llvm/llvm-project/commit/7ecf19697ee21a23fc8782daefcb268085874d7f.diff
LOG: [AMDGPU] Fix and extend vccz workarounds
We have workarounds for two different cases where vccz can get out of
sync with the value in vcc. This fixes them in two ways:
1. Fix the case where the def of vcc was in a previous basic block, by
pessimistically assuming that vccz might be incorrect at a basic block
boundary.
2. Fix the handling of pre-existing waitcnt instructions by calling
generateWaitcntInstBefore before examining ScoreBrackets to determine
whether there's an outstanding smem read operation.
Differential Revision: https://reviews.llvm.org/D91636
Added:
Modified:
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
llvm/test/CodeGen/AMDGPU/infinite-loop.ll
llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 8fbfab9f2d77..c80263df4cb3 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1428,9 +1428,19 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
ScoreBrackets.dump();
});
- // Assume VCCZ is correct at basic block boundaries, unless and until we need
- // to handle cases where that is not true.
+ // Track the correctness of vccz through this basic block. There are two
+ // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
+ // ST->partialVCCWritesUpdateVCCZ().
bool VCCZCorrect = true;
+ if (ST->hasReadVCCZBug()) {
+ // vccz could be incorrect at a basic block boundary if a predecessor wrote
+ // to vcc and then issued an smem load.
+ VCCZCorrect = false;
+ } else if (!ST->partialVCCWritesUpdateVCCZ()) {
+ // vccz could be incorrect at a basic block boundary if a predecessor wrote
+ // to vcc_lo or vcc_hi.
+ VCCZCorrect = false;
+ }
// Walk over the instructions.
MachineInstr *OldWaitcntInstr = nullptr;
@@ -1451,15 +1461,21 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
continue;
}
- // We might need to restore vccz to its correct value for either of two
- // different reasons; see ST->hasReadVCCZBug() and
- // ST->partialVCCWritesUpdateVCCZ().
- bool RestoreVCCZ = false;
- if (readsVCCZ(Inst)) {
- if (!VCCZCorrect) {
- // Restore vccz if it's not known to be correct already.
- RestoreVCCZ = true;
- } else if (ST->hasReadVCCZBug()) {
+ // Generate an s_waitcnt instruction to be placed before Inst, if needed.
+ Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
+ OldWaitcntInstr = nullptr;
+
+ // Restore vccz if it's not known to be correct already.
+ bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
+
+ // Don't examine operands unless we need to track vccz correctness.
+ if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
+ if (Inst.definesRegister(AMDGPU::VCC_LO) ||
+ Inst.definesRegister(AMDGPU::VCC_HI)) {
+ // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
+ if (!ST->partialVCCWritesUpdateVCCZ())
+ VCCZCorrect = false;
+ } else if (Inst.definesRegister(AMDGPU::VCC)) {
// There is a hardware bug on CI/SI where SMRD instruction may corrupt
// vccz bit, so when we detect that an instruction may read from a
// corrupt vccz bit, we need to:
@@ -1467,12 +1483,16 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// operations to complete.
// 2. Restore the correct value of vccz by writing the current value
// of vcc back to vcc.
- if (ScoreBrackets.getScoreLB(LGKM_CNT) <
- ScoreBrackets.getScoreUB(LGKM_CNT) &&
+ if (ST->hasReadVCCZBug() &&
+ ScoreBrackets.getScoreLB(LGKM_CNT) <
+ ScoreBrackets.getScoreUB(LGKM_CNT) &&
ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
- // Restore vccz if there's an outstanding smem read, which could
- // complete and clobber vccz at any time.
- RestoreVCCZ = true;
+ // Writes to vcc while there's an outstanding smem read may get
+ // clobbered as soon as any read completes.
+ VCCZCorrect = false;
+ } else {
+ // Writes to vcc will fix any incorrect value in vccz.
+ VCCZCorrect = true;
}
}
}
@@ -1482,24 +1502,12 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
const Value *Ptr = Memop->getValue();
SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
}
- }
-
- if (!ST->partialVCCWritesUpdateVCCZ()) {
- if (Inst.definesRegister(AMDGPU::VCC_LO) ||
- Inst.definesRegister(AMDGPU::VCC_HI)) {
- // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
+ if (ST->hasReadVCCZBug()) {
+ // This smem read could complete and clobber vccz at any time.
VCCZCorrect = false;
- } else if (Inst.definesRegister(AMDGPU::VCC)) {
- // Writes to vcc will fix any incorrect value in vccz.
- VCCZCorrect = true;
}
}
- // Generate an s_waitcnt instruction to be placed before
- // cur_Inst, if needed.
- Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
- OldWaitcntInstr = nullptr;
-
updateEventWaitcntAfter(Inst, &ScoreBrackets);
#if 0 // TODO: implement resource type check controlled by options with ub = LB.
diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
index d82d90564aa4..b488c48903ce 100644
--- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -45,6 +45,7 @@ define amdgpu_kernel void @infinite_loop_ret(i32 addrspace(1)* %out) {
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_mov_b64 vcc, vcc
; SI-NEXT: s_cbranch_vccnz BB1_2
; SI-NEXT: BB1_3: ; %UnifiedReturnBlock
; SI-NEXT: s_endpgm
@@ -86,6 +87,7 @@ define amdgpu_kernel void @infinite_loops(i32 addrspace(1)* %out) {
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_mov_b64 vcc, vcc
; SI-NEXT: s_cbranch_vccnz BB2_2
; SI-NEXT: ; %bb.3: ; %Flow
; SI-NEXT: s_mov_b64 s[2:3], 0
@@ -103,6 +105,7 @@ define amdgpu_kernel void @infinite_loops(i32 addrspace(1)* %out) {
; SI-NEXT: BB2_6: ; %loop1
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_mov_b64 vcc, vcc
; SI-NEXT: s_cbranch_vccz BB2_6
; SI-NEXT: BB2_7: ; %DummyReturnBlock
; SI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
index 8bdc05bafacd..03a49f199350 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -42,6 +42,7 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(i64 addrspace(3)* noca
; GCN-NEXT: s_and_b64 vcc, exec, 0
; GCN-NEXT: BB0_6: ; %bb9
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT: s_mov_b64 vcc, vcc
; GCN-NEXT: s_cbranch_vccz BB0_6
; GCN-NEXT: BB0_7: ; %DummyReturnBlock
; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
index 06f09e8e4d07..8f1e2489e6fd 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -227,6 +227,7 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32
; SI-NEXT: s_and_b64 vcc, exec, 0
; SI-NEXT: BB3_12: ; %self.loop
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_mov_b64 vcc, vcc
; SI-NEXT: s_cbranch_vccz BB3_12
; SI-NEXT: BB3_13: ; %DummyReturnBlock
; SI-NEXT: s_endpgm
@@ -298,6 +299,7 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32
; FLAT-NEXT: s_and_b64 vcc, exec, 0
; FLAT-NEXT: BB3_12: ; %self.loop
; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1
+; FLAT-NEXT: s_mov_b64 vcc, vcc
; FLAT-NEXT: s_cbranch_vccz BB3_12
; FLAT-NEXT: BB3_13: ; %DummyReturnBlock
; FLAT-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
index d1dac8c8c782..6bf4fcef66a8 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
@@ -15,6 +15,7 @@ define amdgpu_kernel void @test(i32 %arg, i32 %arg1) {
; CHECK-NEXT: s_and_b64 vcc, exec, 0
; CHECK-NEXT: BB0_2: ; %bb10
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_mov_b64 vcc, vcc
; CHECK-NEXT: s_cbranch_vccz BB0_2
; CHECK-NEXT: BB0_3: ; %DummyReturnBlock
; CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
index 79233141070d..2d93f0ec00ce 100644
--- a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
+++ b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
@@ -168,6 +168,8 @@ body: |
# CHECK-LABEL: name: vcc_def_pred
# CHECK: bb.1:
+# SI: $vcc = S_MOV_B64 $vcc
+# GFX9: $vcc = S_MOV_B64 $vcc
# CHECK: S_CBRANCH_VCCZ %bb.2, implicit $vcc
name: vcc_def_pred
@@ -243,7 +245,7 @@ body: |
# SI-NEXT: $vcc = S_MOV_B64 0
# SI-NEXT: S_WAITCNT 127
# SI-NEXT: S_NOP 0
-# FIXME should have $vcc = S_MOV_B64 $vcc
+# SI-NEXT: $vcc = S_MOV_B64 $vcc
# SI-NEXT: S_CBRANCH_VCCZ %bb.1, implicit $vcc
name: load_def_wait_nop_use
body: |
@@ -298,7 +300,7 @@ body: |
# SI-NEXT: $sgpr0 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0
# SI-NEXT: S_WAITCNT 127
# SI-NEXT: S_NOP 0
-# FIXME should have $vcc = S_MOV_B64 $vcc
+# SI-NEXT: $vcc = S_MOV_B64 $vcc
# SI-NEXT: S_CBRANCH_VCCZ %bb.1, implicit $vcc
name: def_load_wait_nop_use
body: |
More information about the llvm-commits
mailing list