[llvm] c262b69 - [AMDGPU] Fix endcf collapse
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 13 13:50:31 PDT 2020
Author: Stanislav Mekhanoshin
Date: 2020-03-13T13:50:21-07:00
New Revision: c262b69dcc0a280ec4d551244b3571123c36a370
URL: https://github.com/llvm/llvm-project/commit/c262b69dcc0a280ec4d551244b3571123c36a370
DIFF: https://github.com/llvm/llvm-project/commit/c262b69dcc0a280ec4d551244b3571123c36a370.diff
LOG: [AMDGPU] Fix endcf collapse
Only collapse inner endcf if the outer one belongs to SI_IF.
If it belongs to SI_ELSE instead, then the mask being restored is in fact
a partial inverse of what we need.
Differential Revision: https://reviews.llvm.org/D76154
Added:
Modified:
llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index de9d8fa29005..b46a515c0b5e 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -76,7 +76,7 @@ using namespace llvm;
static cl::opt<bool>
RemoveRedundantEndcf("amdgpu-remove-redundant-endcf",
- cl::init(false), cl::ReallyHidden);
+ cl::init(true), cl::ReallyHidden);
namespace {
@@ -87,6 +87,7 @@ class SILowerControlFlow : public MachineFunctionPass {
LiveIntervals *LIS = nullptr;
MachineRegisterInfo *MRI = nullptr;
DenseSet<const MachineInstr*> LoweredEndCf;
+ DenseSet<Register> LoweredIf;
const TargetRegisterClass *BoolRC = nullptr;
unsigned AndOpc;
@@ -212,6 +213,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
.addReg(Exec)
.addReg(Exec, RegState::ImplicitDefine);
+ LoweredIf.insert(CopyReg);
Register Tmp = MRI->createVirtualRegister(BoolRC);
@@ -453,11 +455,19 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI.getIterator()));
if (Next != MBB.end() && (Next->getOpcode() == AMDGPU::SI_END_CF ||
LoweredEndCf.count(&*Next))) {
- LLVM_DEBUG(dbgs() << "Skip redundant "; MI.dump());
- if (LIS)
- LIS->RemoveMachineInstrFromMaps(MI);
- MI.eraseFromParent();
- return;
+ // Only skip inner END_CF if outer ENDCF belongs to SI_IF.
+ // If that belongs to SI_ELSE then saved mask has an inverted value.
+ Register SavedExec = Next->getOperand(0).getReg();
+ const MachineInstr *Def = MRI.getUniqueVRegDef(SavedExec);
+ // A lowered SI_IF turns definition into COPY of exec.
+ if (Def && (Def->getOpcode() == AMDGPU::SI_IF ||
+ LoweredIf.count(SavedExec))) {
+ LLVM_DEBUG(dbgs() << "Skip redundant "; MI.dump());
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(MI);
+ MI.eraseFromParent();
+ return;
+ }
}
}
@@ -617,6 +627,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
}
LoweredEndCf.clear();
+ LoweredIf.clear();
return true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index c2676eaeb989..1af2ca55308b 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -142,16 +142,15 @@ bb.outer.end: ; preds = %bb, %bb.then, %b
; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER:BB[0-9_]+]]
; GCN-NEXT: ; %bb.{{[0-9]+}}:
; GCN: store_dword
-; GCN-NEXT: s_and_b64 exec, exec,
+; GCN-NEXT: s_and_saveexec_b64 [[SAVEEXEC_ELSE:s\[[0-9:]+\]]],
; GCN-NEXT: s_cbranch_execz [[FLOW1:BB[0-9_]+]]
; GCN-NEXT: ; %bb.{{[0-9]+}}:
; GCN: store_dword
; GCN-NEXT: [[FLOW1]]:
-; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER3]]
-; GCN-NOT: s_or_b64 exec
-; GCN-NOT: {{^.*:}}
-; GCN: ds_write_b32
-; GCN: s_endpgm
+; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_ELSE]]
+; GCN: s_or_b64 exec, exec, [[SAVEEXEC_OUTER3]]
+; GCN: ds_write_b32
+; GCN: s_endpgm
define amdgpu_kernel void @nested_if_else_if(i32 addrspace(1)* nocapture %arg) {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir b/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
index 277797fc2fba..7760c921149e 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
@@ -834,3 +834,124 @@ body: |
SI_END_CF %2:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
S_BRANCH %bb.1
+
+...
+
+# Both s_or_b64 shall be preserved since the outer SI_END_CF belongs to SI_ELSE.
+
+---
+name: simple_outer_if_else
+tracksRegLiveness: true
+liveins:
+ - { reg: '$vgpr0', virtual-reg: '%0' }
+ - { reg: '$sgpr0_sgpr1', virtual-reg: '%1' }
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GCN-LABEL: name: simple_outer_if_else
+ ; GCN: bb.0:
+ ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN: liveins: $vgpr0, $sgpr0_sgpr1
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1
+ ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN: [[V_CMP_LT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_U32_e64 1, [[COPY1]], implicit $exec
+ ; GCN: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
+ ; GCN: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_LT_U32_e64_]], implicit-def dead $scc
+ ; GCN: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc
+ ; GCN: $exec = S_MOV_B64_term killed [[S_AND_B64_]]
+ ; GCN: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GCN: bb.1:
+ ; GCN: successors: %bb.2(0x80000000)
+ ; GCN: S_BRANCH %bb.2
+ ; GCN: bb.2:
+ ; GCN: successors: %bb.3(0x40000000), %bb.6(0x40000000)
+ ; GCN: [[COPY3:%[0-9]+]]:sreg_64 = COPY [[S_XOR_B64_]]
+ ; GCN: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 [[COPY3]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GCN: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
+ ; GCN: S_CBRANCH_EXECZ %bb.6, implicit $exec
+ ; GCN: bb.3:
+ ; GCN: successors: %bb.3(0x40000000), %bb.4(0x40000000)
+ ; GCN: undef %5.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[COPY]], 9, 0, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4)
+ ; GCN: undef %6.sub0:vreg_64 = V_LSHLREV_B32_e32 2, [[COPY1]], implicit $exec
+ ; GCN: %6.sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN: [[COPY4:%[0-9]+]]:vgpr_32 = COPY %5.sub1
+ ; GCN: undef %8.sub0:vreg_64, %9:sreg_64_xexec = V_ADD_I32_e64 %5.sub0, %6.sub0, 0, implicit $exec
+ ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY4]], %9, 0, implicit $exec
+ ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440
+ ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0
+ ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+ ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec
+ ; GCN: [[COPY5:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
+ ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY5]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
+ ; GCN: $exec = S_MOV_B64_term killed [[S_AND_B64_1]]
+ ; GCN: S_CBRANCH_EXECZ %bb.3, implicit $exec
+ ; GCN: bb.4:
+ ; GCN: successors: %bb.5(0x80000000)
+ ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2
+ ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2
+ ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+ ; GCN: bb.5:
+ ; GCN: successors: %bb.6(0x80000000)
+ ; GCN: $exec = S_OR_B64 $exec, [[COPY5]], implicit-def $scc
+ ; GCN: bb.6:
+ ; GCN: $exec = S_OR_B64 $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
+ ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
+ ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN: $m0 = S_MOV_B32 -1
+ ; GCN: DS_WRITE_B32 [[V_MOV_B32_e32_2]], [[V_MOV_B32_e32_1]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3)
+ ; GCN: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1, %bb.2
+ liveins: $vgpr0, $sgpr0_sgpr1
+
+ %1:sgpr_64 = COPY $sgpr0_sgpr1
+ %0:vgpr_32 = COPY $vgpr0
+ %2:sreg_64 = V_CMP_LT_U32_e64 1, %0, implicit $exec
+ %3:sreg_64 = SI_IF %2:sreg_64, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+ bb.1:
+ successors: %bb.2
+ S_BRANCH %bb.2
+
+ bb.2:
+ successors: %bb.3, %bb.6
+ %4:sreg_64 = SI_ELSE %3:sreg_64, %bb.6, 0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+ bb.3:
+ successors: %bb.3, %bb.4
+
+ undef %5.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %1, 9, 0, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4)
+ undef %6.sub0:vreg_64 = V_LSHLREV_B32_e32 2, %0, implicit $exec
+ %6.sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ %7:vgpr_32 = COPY %5.sub1
+ undef %8.sub0:vreg_64, %9:sreg_64_xexec = V_ADD_I32_e64 %5.sub0, %6.sub0, 0, implicit $exec
+ %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec
+ %5.sub3:sgpr_128 = S_MOV_B32 61440
+ %5.sub2:sgpr_128 = S_MOV_B32 0
+ BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+ %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec
+ %12:sreg_64 = SI_IF %11:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+ bb.4:
+ successors: %bb.5
+
+ %5.sub0:sgpr_128 = COPY %5.sub2
+ %5.sub1:sgpr_128 = COPY %5.sub2
+ %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+ BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+
+ bb.5:
+ successors: %bb.6
+
+ SI_END_CF %12:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+ bb.6:
+ SI_END_CF %4:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ %15:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
+ %16:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ $m0 = S_MOV_B32 -1
+ DS_WRITE_B32 %16, %15, 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3)
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
index 9f717df480fb..5f0b5aaecac9 100644
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -58,7 +58,7 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, float addrspace(3
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1
; GFX9-NEXT: s_and_saveexec_b64 s[10:11], s[4:5]
-; GFX9-NEXT: s_cbranch_execz BB1_4
+; GFX9-NEXT: s_cbranch_execz BB1_3
; GFX9-NEXT: ; %bb.1: ; %bb19
; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v6
; GFX9-NEXT: v_and_b32_e32 v5, 0xffffff, v6
@@ -100,9 +100,7 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, float addrspace(3
; GFX9-NEXT: v_add_u32_e32 v3, v3, v6
; GFX9-NEXT: s_andn2_b64 exec, exec, s[12:13]
; GFX9-NEXT: s_cbranch_execnz BB1_2
-; GFX9-NEXT: ; %bb.3: ; %Flow
-; GFX9-NEXT: s_or_b64 exec, exec, s[12:13]
-; GFX9-NEXT: BB1_4: ; %Flow3
+; GFX9-NEXT: BB1_3: ; %Flow3
; GFX9-NEXT: s_or_b64 exec, exec, s[10:11]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
More information about the llvm-commits
mailing list