[llvm] r365910 - [AMDGPU] Fix DPP combiner check for exec modification
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 12 08:59:40 PDT 2019
Author: foad
Date: Fri Jul 12 08:59:40 2019
New Revision: 365910
URL: http://llvm.org/viewvc/llvm-project?rev=365910&view=rev
Log:
[AMDGPU] Fix DPP combiner check for exec modification
Summary:
r363675 changed the exec modification helper function, now called
execMayBeModifiedBeforeUse, so that if no UseMI is specified it checks
all instructions in the basic block, even beyond the last use. That
meant that the DPP combiner no longer worked in any basic block that
ended with a control flow instruction, and in particular it didn't work
on code sequences generated by the atomic optimizer.
Fix it by reinstating the old behaviour but in a new helper function
execMayBeModifiedBeforeAnyUse, and limiting the number of instructions
scanned.
Reviewers: arsenm, vpykhtin
Subscribers: kzhuravl, nemanjai, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, kbarton, MaskRay, jfb, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D64393
Modified:
llvm/trunk/lib/Target/AMDGPU/GCNDPPCombine.cpp
llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp
llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
llvm/trunk/test/CodeGen/AMDGPU/dpp_combine.mir
Modified: llvm/trunk/lib/Target/AMDGPU/GCNDPPCombine.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/GCNDPPCombine.cpp?rev=365910&r1=365909&r2=365910&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNDPPCombine.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/GCNDPPCombine.cpp Fri Jul 12 08:59:40 2019
@@ -344,7 +344,7 @@ bool GCNDPPCombine::combineDPPMov(Machin
auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
assert(DstOpnd && DstOpnd->isReg());
auto DPPMovReg = DstOpnd->getReg();
- if (execMayBeModifiedBeforeUse(*MRI, DPPMovReg, MovMI)) {
+ if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {
LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same"
" for all uses\n");
return false;
Modified: llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp?rev=365910&r1=365909&r2=365910&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp Fri Jul 12 08:59:40 2019
@@ -650,7 +650,7 @@ void SIFoldOperands::foldOperand(
if (execMayBeModifiedBeforeUse(*MRI,
UseMI->getOperand(UseOpIdx).getReg(),
*OpToFold.getParent(),
- UseMI))
+ *UseMI))
return;
UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
@@ -669,7 +669,7 @@ void SIFoldOperands::foldOperand(
if (execMayBeModifiedBeforeUse(*MRI,
UseMI->getOperand(UseOpIdx).getReg(),
*OpToFold.getParent(),
- UseMI))
+ *UseMI))
return;
// %vgpr = COPY %sgpr0
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp?rev=365910&r1=365909&r2=365910&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp Fri Jul 12 08:59:40 2019
@@ -6269,47 +6269,75 @@ MachineInstr *llvm::getVRegSubRegDef(con
}
bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
- unsigned VReg,
+ Register VReg,
const MachineInstr &DefMI,
- const MachineInstr *UseMI) {
+ const MachineInstr &UseMI) {
assert(MRI.isSSA() && "Must be run on SSA");
auto *TRI = MRI.getTargetRegisterInfo();
auto *DefBB = DefMI.getParent();
- if (UseMI) {
+ // Don't bother searching between blocks, although it is possible this block
+ // doesn't modify exec.
+ if (UseMI.getParent() != DefBB)
+ return true;
+
+ const int MaxInstScan = 20;
+ int NumInst = 0;
+
+ // Stop scan at the use.
+ auto E = UseMI.getIterator();
+ for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
+ if (I->isDebugInstr())
+ continue;
+
+ if (++NumInst > MaxInstScan)
+ return true;
+
+ if (I->modifiesRegister(AMDGPU::EXEC, TRI))
+ return true;
+ }
+
+ return false;
+}
+
+bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
+ Register VReg,
+ const MachineInstr &DefMI) {
+ assert(MRI.isSSA() && "Must be run on SSA");
+
+ auto *TRI = MRI.getTargetRegisterInfo();
+ auto *DefBB = DefMI.getParent();
+
+ const int MaxUseInstScan = 10;
+ int NumUseInst = 0;
+
+ for (auto &UseInst : MRI.use_nodbg_instructions(VReg)) {
// Don't bother searching between blocks, although it is possible this block
// doesn't modify exec.
- if (UseMI->getParent() != DefBB)
+ if (UseInst.getParent() != DefBB)
+ return true;
+
+ if (++NumUseInst > MaxUseInstScan)
return true;
- } else {
- int NumUse = 0;
- const int MaxUseScan = 10;
-
- for (auto &UseInst : MRI.use_nodbg_instructions(VReg)) {
- if (UseInst.getParent() != DefBB)
- return true;
-
- if (NumUse++ > MaxUseScan)
- return true;
- }
}
const int MaxInstScan = 20;
- int NumScan = 0;
+ int NumInst = 0;
- // Stop scan at the use if known.
- auto E = UseMI ? UseMI->getIterator() : DefBB->end();
- for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
+ // Stop scan when we have seen all the uses.
+ for (auto I = std::next(DefMI.getIterator()); ; ++I) {
if (I->isDebugInstr())
continue;
- if (NumScan++ > MaxInstScan)
+ if (++NumInst > MaxInstScan)
return true;
+ if (I->readsRegister(VReg))
+ if (--NumUseInst == 0)
+ return false;
+
if (I->modifiesRegister(AMDGPU::EXEC, TRI))
return true;
}
-
- return false;
}
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h?rev=365910&r1=365909&r2=365910&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h Fri Jul 12 08:59:40 2019
@@ -1016,13 +1016,19 @@ MachineInstr *getVRegSubRegDef(const Tar
MachineRegisterInfo &MRI);
/// \brief Return false if EXEC is not changed between the def of \p VReg at \p
-/// DefMI and uses. If \p UseMI is not specified, this checks all uses of \p
-/// VReg. Should be run on SSA. Currently does not attempt to track between
-/// blocks.
+/// DefMI and the use at \p UseMI. Should be run on SSA. Currently does not
+/// attempt to track between blocks.
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
- unsigned VReg,
+ Register VReg,
const MachineInstr &DefMI,
- const MachineInstr *UseMI = nullptr);
+ const MachineInstr &UseMI);
+
+/// \brief Return false if EXEC is not changed between the def of \p VReg at \p
+/// DefMI and all its uses. Should be run on SSA. Currently does not attempt to
+/// track between blocks.
+bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
+ Register VReg,
+ const MachineInstr &DefMI);
namespace AMDGPU {
Modified: llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll?rev=365910&r1=365909&r2=365910&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll Fri Jul 12 08:59:40 2019
@@ -1,6 +1,6 @@
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
declare i32 @llvm.amdgcn.workitem.id.x()
@@ -42,6 +42,8 @@ entry:
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
+; GFX8MORE: v_add_u32_dpp
+; GFX8MORE: v_add_u32_dpp
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_add v[[value]]
@@ -133,6 +135,8 @@ entry:
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_sub v[[value]]
Modified: llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll?rev=365910&r1=365909&r2=365910&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll Fri Jul 12 08:59:40 2019
@@ -45,6 +45,8 @@ entry:
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX8MORE: v_add_u32_dpp
+; GFX8MORE: v_add_u32_dpp
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
@@ -136,6 +138,8 @@ entry:
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
Modified: llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll?rev=365910&r1=365909&r2=365910&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll Fri Jul 12 08:59:40 2019
@@ -39,6 +39,8 @@ else:
; GFX8MORE: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
; GFX8MORE: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
; GFX8MORE: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
+; GFX8MORE: v_add_u32_dpp
+; GFX8MORE: v_add_u32_dpp
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
Modified: llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll?rev=365910&r1=365909&r2=365910&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll Fri Jul 12 08:59:40 2019
@@ -44,6 +44,8 @@ entry:
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
+; GFX8MORE: v_add_u32_dpp
+; GFX8MORE: v_add_u32_dpp
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_add v[[value]]
@@ -104,6 +106,8 @@ entry:
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_sub v[[value]]
Modified: llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll?rev=365910&r1=365909&r2=365910&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll Fri Jul 12 08:59:40 2019
@@ -44,6 +44,8 @@ entry:
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
+; GFX8MORE: v_add_u32_dpp
+; GFX8MORE: v_add_u32_dpp
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_add v[[value]]
@@ -117,6 +119,8 @@ entry:
; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
; GFX7LESS-NOT: s_bcnt1_i32_b64
; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
+; GFX8MORE: v_sub{{(rev)?}}_u32_dpp
; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
; GFX8MORE: buffer_atomic_sub v[[value]]
Modified: llvm/trunk/test/CodeGen/AMDGPU/dpp_combine.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/dpp_combine.mir?rev=365910&r1=365909&r2=365910&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/dpp_combine.mir (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/dpp_combine.mir Fri Jul 12 08:59:40 2019
@@ -380,6 +380,38 @@ body: |
%9:vgpr_32 = V_SUB_I32_e32 5, %7, implicit-def $vcc, implicit $exec
...
+# tests on sequences of dpp consumers followed by control flow
+# CHECK-LABEL: name: dpp_seq_cf
+# CHECK: %4:vgpr_32 = V_ADD_I32_dpp %1, %0, %1, 1, 14, 15, 0, implicit-def $vcc, implicit $exec
+# CHECK: %5:vgpr_32 = V_SUBREV_I32_dpp %1, %0, %1, 1, 14, 15, 0, implicit-def $vcc, implicit $exec
+# CHECK: %6:vgpr_32 = V_OR_B32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec
+
+name: dpp_seq_cf
+tracksRegLiveness: true
+body: |
+ bb.0:
+ successors: %bb.1, %bb.2
+ liveins: $vgpr0, $vgpr1
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = COPY $vgpr1
+ %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+
+ %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec
+ %4:vgpr_32 = V_ADD_I32_e32 %3, %1, implicit-def $vcc, implicit $exec
+ %5:vgpr_32 = V_SUB_I32_e32 %1, %3, implicit-def $vcc, implicit $exec
+ %6:vgpr_32 = V_OR_B32_e32 %3, %1, implicit $exec
+
+ %7:sreg_64 = V_CMP_EQ_U32_e64 %5, %6, implicit $exec
+ %8:sreg_64 = SI_IF %7, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2
+
+ bb.2:
+ SI_END_CF %8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+...
+
# old reg def is in diff BB - cannot combine
# CHECK-LABEL: name: old_in_diff_bb
# CHECK: %3:vgpr_32 = V_MOV_B32_dpp %2, %1, 1, 1, 1, 0, implicit $exec
More information about the llvm-commits
mailing list