[llvm] e9b236f - AMDGPU: Check for other defs when folding conditions into s_andn2_b64
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 28 13:36:30 PDT 2020
Author: Matt Arsenault
Date: 2020-07-28T16:36:23-04:00
New Revision: e9b236f411c5683d270a381bf810ba3c8f3ed12c
URL: https://github.com/llvm/llvm-project/commit/e9b236f411c5683d270a381bf810ba3c8f3ed12c
DIFF: https://github.com/llvm/llvm-project/commit/e9b236f411c5683d270a381bf810ba3c8f3ed12c.diff
LOG: AMDGPU: Check for other defs when folding conditions into s_andn2_b64
We can't fold the masked compare value through the select if the
select condition is redefined after the and instruction. Fixes a
verifier error from trying to use the outgoing value defined in the
block.
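
For illustration, a minimal sketch of the sequence the pass rewrites
and of the case this patch now rejects (register names like %cc, %sel,
%cmp are illustrative, not taken from the tests below):

    ; Foldable:
    ;   %sel = V_CNDMASK_B32_e64 0, 1, %cc
    ;   %cmp = V_CMP_NE_U32 1, %sel
    ;   $vcc = S_AND_B64 $exec, %cmp
    ; =>
    ;   $vcc = S_ANDN2_B64 $exec, %cc
    ;
    ; Not foldable: %cc is redefined between the select and the and, so
    ; an S_ANDN2_B64 built at the and's insert point would read the new
    ; value of %cc rather than the one the select actually used:
    ;   %sel = V_CNDMASK_B32_e64 0, 1, %cc
    ;   %cmp = V_CMP_NE_U32 1, %sel
    ;   %cc  = S_MOV_B64 0             ; redef between select and and
    ;   $vcc = S_AND_B64 $exec, %cmp

The first case corresponds to the fold_andn2_* tests added below, and
the second to the no_fold_andn2_* tests.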
I'm not sure why this pass bothers to handle physregs. It makes this
more complex and forces extra liveness computation.
Added:
llvm/test/CodeGen/AMDGPU/optimize-exec-mask-pre-ra-loop-phi.mir
Modified:
llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index 8af00fcf62a8..74546befbb59 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -77,6 +77,32 @@ static bool isFullExecCopy(const MachineInstr& MI, const GCNSubtarget& ST) {
return false;
}
+// See if there is a def between \p AndIdx and \p SelIdx that needs to live
+// beyond \p AndIdx.
+static bool isDefBetween(const LiveRange &LR, SlotIndex AndIdx,
+ SlotIndex SelIdx) {
+ LiveQueryResult AndLRQ = LR.Query(AndIdx);
+ return (!AndLRQ.isKill() && AndLRQ.valueIn() != LR.Query(SelIdx).valueOut());
+}
+
+// FIXME: Why do we bother trying to handle physical registers here?
+static bool isDefBetween(const SIRegisterInfo &TRI,
+ LiveIntervals *LIS, Register Reg,
+ const MachineInstr &Sel, const MachineInstr &And) {
+ SlotIndex AndIdx = LIS->getInstructionIndex(And);
+ SlotIndex SelIdx = LIS->getInstructionIndex(Sel);
+
+ if (Reg.isVirtual())
+ return isDefBetween(LIS->getInterval(Reg), AndIdx, SelIdx);
+
+ for (MCRegUnitIterator UI(Reg, &TRI); UI.isValid(); ++UI) {
+ if (isDefBetween(LIS->getRegUnit(*UI), AndIdx, SelIdx))
+ return true;
+ }
+
+ return false;
+}
+
// Optimize sequence
// %sel = V_CNDMASK_B32_e64 0, 1, %cc
// %cmp = V_CMP_NE_U32 1, %1
@@ -158,10 +184,16 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
Op1->getImm() != 0 || Op2->getImm() != 1)
return AMDGPU::NoRegister;
+ Register CCReg = CC->getReg();
+
+ // If there was a def between the select and the and, we would need to move it
+ // to fold this.
+ if (isDefBetween(*TRI, LIS, CCReg, *Sel, *And))
+ return AMDGPU::NoRegister;
+
LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t' << *Cmp << '\t'
<< *And);
- Register CCReg = CC->getReg();
LIS->RemoveMachineInstrFromMaps(*And);
MachineInstr *Andn2 =
BuildMI(MBB, *And, And->getDebugLoc(), TII->get(Andn2Opc),
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-exec-mask-pre-ra-loop-phi.mir b/llvm/test/CodeGen/AMDGPU/optimize-exec-mask-pre-ra-loop-phi.mir
new file mode 100644
index 000000000000..0c62d666e729
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/optimize-exec-mask-pre-ra-loop-phi.mir
@@ -0,0 +1,201 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs -run-pass=si-optimize-exec-masking-pre-ra,si-optimize-exec-masking-pre-ra -o - %s | FileCheck %s
+
+# FIXME: Second run of the pass is a workaround for a bug in
+# -run-pass. The verifier doesn't detect broken LiveIntervals, see bug
+# 46873
+
+
+# Cannot fold this without moving the def of %7 after the and.
+---
+name: no_fold_andn2_select_condition_live_out_phi
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: no_fold_andn2_select_condition_live_out_phi
+ ; CHECK: bb.0:
+ ; CHECK: successors: %bb.2(0x80000000)
+ ; CHECK: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1
+ ; CHECK: undef %1.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK: S_BRANCH %bb.2
+ ; CHECK: bb.1:
+ ; CHECK: S_ENDPGM 0
+ ; CHECK: bb.2:
+ ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_MOV_B64_]], implicit $exec
+ ; CHECK: V_CMP_NE_U32_e32 1, [[V_CNDMASK_B32_e64_]], implicit-def $vcc, implicit $exec
+ ; CHECK: %1.sub1:vreg_64 = COPY %1.sub0
+ ; CHECK: DS_WRITE_B64_gfx9 undef %3:vgpr_32, %1, 0, 0, implicit $exec :: (store 8, addrspace 3)
+ ; CHECK: ATOMIC_FENCE 4, 2
+ ; CHECK: [[S_MOV_B64_1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
+ ; CHECK: $vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc
+ ; CHECK: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ ; CHECK: S_BRANCH %bb.2
+ bb.0:
+ successors: %bb.2
+
+ %7:sreg_64_xexec = S_MOV_B64 -1
+ undef %5.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ S_BRANCH %bb.2
+
+ bb.1:
+ S_ENDPGM 0
+
+ bb.2:
+ successors: %bb.1, %bb.2
+
+ %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %7, implicit $exec
+ V_CMP_NE_U32_e32 1, %4, implicit-def $vcc, implicit $exec
+ %5.sub1:vreg_64 = COPY %5.sub0
+ DS_WRITE_B64_gfx9 undef %6:vgpr_32, %5, 0, 0, implicit $exec :: (store 8, addrspace 3)
+ ATOMIC_FENCE 4, 2
+ %7:sreg_64_xexec = S_MOV_B64 0
+ $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+ S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+ S_BRANCH %bb.2
+
+...
+
+# It's OK to fold this, since the phi def is after the andn2 insert point.
+---
+name: fold_andn2_select_condition_live_out_phi_reorder
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: fold_andn2_select_condition_live_out_phi_reorder
+ ; CHECK: bb.0:
+ ; CHECK: successors: %bb.2(0x80000000)
+ ; CHECK: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1
+ ; CHECK: undef %1.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK: S_BRANCH %bb.2
+ ; CHECK: bb.1:
+ ; CHECK: S_ENDPGM 0
+ ; CHECK: bb.2:
+ ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK: %1.sub1:vreg_64 = COPY %1.sub0
+ ; CHECK: DS_WRITE_B64_gfx9 undef %3:vgpr_32, %1, 0, 0, implicit $exec :: (store 8, addrspace 3)
+ ; CHECK: ATOMIC_FENCE 4, 2
+ ; CHECK: $vcc = S_ANDN2_B64 $exec, [[S_MOV_B64_]], implicit-def dead $scc
+ ; CHECK: [[S_MOV_B64_1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
+ ; CHECK: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ ; CHECK: S_BRANCH %bb.2
+ bb.0:
+ successors: %bb.2
+
+ %7:sreg_64_xexec = S_MOV_B64 -1
+ undef %5.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ S_BRANCH %bb.2
+
+ bb.1:
+ S_ENDPGM 0
+
+ bb.2:
+ successors: %bb.1, %bb.2
+
+ %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %7, implicit $exec
+ V_CMP_NE_U32_e32 1, %4, implicit-def $vcc, implicit $exec
+ %5.sub1:vreg_64 = COPY %5.sub0
+ DS_WRITE_B64_gfx9 undef %6:vgpr_32, %5, 0, 0, implicit $exec :: (store 8, addrspace 3)
+ ATOMIC_FENCE 4, 2
+ $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+ %7:sreg_64_xexec = S_MOV_B64 0
+ S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+ S_BRANCH %bb.2
+
+...
+
+---
+name: no_fold_andn2_select_condition_live_out_phi_physreg
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: no_fold_andn2_select_condition_live_out_phi_physreg
+ ; CHECK: bb.0:
+ ; CHECK: successors: %bb.2(0x80000000)
+ ; CHECK: $sgpr4_sgpr5 = S_MOV_B64 -1
+ ; CHECK: undef %0.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK: S_BRANCH %bb.2
+ ; CHECK: bb.1:
+ ; CHECK: S_ENDPGM 0
+ ; CHECK: bb.2:
+ ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK: liveins: $sgpr4_sgpr5
+ ; CHECK: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr4_sgpr5, implicit $exec
+ ; CHECK: V_CMP_NE_U32_e32 1, [[V_CNDMASK_B32_e64_]], implicit-def $vcc, implicit $exec
+ ; CHECK: %0.sub1:vreg_64 = COPY %0.sub0
+ ; CHECK: DS_WRITE_B64_gfx9 undef %2:vgpr_32, %0, 0, 0, implicit $exec :: (store 8, addrspace 3)
+ ; CHECK: ATOMIC_FENCE 4, 2
+ ; CHECK: $sgpr4_sgpr5 = S_MOV_B64 0
+ ; CHECK: $vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc
+ ; CHECK: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ ; CHECK: S_BRANCH %bb.2
+ bb.0:
+ successors: %bb.2
+
+ $sgpr4_sgpr5 = S_MOV_B64 -1
+ undef %5.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ S_BRANCH %bb.2
+
+ bb.1:
+ S_ENDPGM 0
+
+ bb.2:
+ successors: %bb.1, %bb.2
+ liveins: $sgpr4_sgpr5
+
+ %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr4_sgpr5, implicit $exec
+ V_CMP_NE_U32_e32 1, %4, implicit-def $vcc, implicit $exec
+ %5.sub1:vreg_64 = COPY %5.sub0
+ DS_WRITE_B64_gfx9 undef %6:vgpr_32, %5, 0, 0, implicit $exec :: (store 8, addrspace 3)
+ ATOMIC_FENCE 4, 2
+ $sgpr4_sgpr5 = S_MOV_B64 0
+ $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+ S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+ S_BRANCH %bb.2
+
+...
+
+---
+name: fold_andn2_select_condition_live_out_phi_physreg_reorder
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: fold_andn2_select_condition_live_out_phi_physreg_reorder
+ ; CHECK: bb.0:
+ ; CHECK: successors: %bb.2(0x80000000)
+ ; CHECK: $sgpr4_sgpr5 = S_MOV_B64 -1
+ ; CHECK: undef %0.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK: S_BRANCH %bb.2
+ ; CHECK: bb.1:
+ ; CHECK: S_ENDPGM 0
+ ; CHECK: bb.2:
+ ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK: liveins: $sgpr4_sgpr5
+ ; CHECK: %0.sub1:vreg_64 = COPY %0.sub0
+ ; CHECK: DS_WRITE_B64_gfx9 undef %2:vgpr_32, %0, 0, 0, implicit $exec :: (store 8, addrspace 3)
+ ; CHECK: ATOMIC_FENCE 4, 2
+ ; CHECK: $vcc = S_ANDN2_B64 $exec, $sgpr4_sgpr5, implicit-def dead $scc
+ ; CHECK: $sgpr4_sgpr5 = S_MOV_B64 0
+ ; CHECK: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ ; CHECK: S_BRANCH %bb.2
+ bb.0:
+ successors: %bb.2
+
+ $sgpr4_sgpr5 = S_MOV_B64 -1
+ undef %5.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ S_BRANCH %bb.2
+
+ bb.1:
+ S_ENDPGM 0
+
+ bb.2:
+ successors: %bb.1, %bb.2
+ liveins: $sgpr4_sgpr5
+
+ %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr4_sgpr5, implicit $exec
+ V_CMP_NE_U32_e32 1, %4, implicit-def $vcc, implicit $exec
+ %5.sub1:vreg_64 = COPY %5.sub0
+ DS_WRITE_B64_gfx9 undef %6:vgpr_32, %5, 0, 0, implicit $exec :: (store 8, addrspace 3)
+ ATOMIC_FENCE 4, 2
+ $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+ $sgpr4_sgpr5 = S_MOV_B64 0
+ S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+ S_BRANCH %bb.2
+
+...