[llvm] 3f8ef57 - MachineSink: Fix sinking VGPR def out of a divergent loop

Tue Jul 18 03:15:57 PDT 2023

Author: Matt Arsenault
Date: 2023-07-18T06:15:50-04:00
New Revision: 3f8ef57bede94445b1a1042c987cc914a886e7ff

URL: https://github.com/llvm/llvm-project/commit/3f8ef57bede94445b1a1042c987cc914a886e7ff
DIFF: https://github.com/llvm/llvm-project/commit/3f8ef57bede94445b1a1042c987cc914a886e7ff.diff

LOG: MachineSink: Fix sinking VGPR def out of a divergent loop

This fixes sinking a VGPR def out of a loop past the reconvergence
point at the SI_END_CF. There was a prior fix which introduced
blockPrologueInterferes (D121277) to fix the same basic problem for
the post RA sink. This also had the special case isIgnorableUse case
which was incorrect, because in some contexts the exec use is not
ignorable.

I'm thinking about a new way to represent this which will avoid
needing hasIgnorableUse and isBasicBlockPrologue, which would function
more like the exception handling.

Fixes: SWDEV-407790

https://reviews.llvm.org/D155343

Added: 
    

Modified: 
    llvm/lib/CodeGen/MachineSink.cpp
    llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
    llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir
    llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 3b0166ee139fc1..8da97dc7e74240 100644

--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -288,8 +288,7 @@ static bool blockPrologueInterferes(const MachineBasicBlock *BB,
       if (!Reg)
         continue;
       if (MO.isUse()) {
-        if (Reg.isPhysical() &&
-            (TII->isIgnorableUse(MO) || (MRI && MRI->isConstantPhysReg(Reg))))
+        if (Reg.isPhysical() && MRI && MRI->isConstantPhysReg(Reg))
           continue;
         if (PI->modifiesRegister(Reg, TRI))
           return true;
@@ -1007,16 +1006,24 @@ MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB,
   if (MBB == SuccToSinkTo)
     return nullptr;
 
+  if (!SuccToSinkTo)
+    return nullptr;
+
   // It's not safe to sink instructions to EH landing pad. Control flow into
   // landing pad is implicitly defined.
-  if (SuccToSinkTo && SuccToSinkTo->isEHPad())
+  if (SuccToSinkTo->isEHPad())
     return nullptr;
 
   // It ought to be okay to sink instructions into an INLINEASM_BR target, but
   // only if we make sure that MI occurs _before_ an INLINEASM_BR instruction in
   // the source block (which this code does not yet do). So for now, forbid
   // doing so.
-  if (SuccToSinkTo && SuccToSinkTo->isInlineAsmBrIndirectTarget())
+  if (SuccToSinkTo->isInlineAsmBrIndirectTarget())
+    return nullptr;
+
+  MachineBasicBlock::const_iterator InsertPos =
+      SuccToSinkTo->SkipPHIsAndLabels(SuccToSinkTo->begin());
+  if (blockPrologueInterferes(SuccToSinkTo, InsertPos, MI, TRI, TII, MRI))
     return nullptr;
 
   return SuccToSinkTo;

diff  --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
index b8e74bc7db09a1..e2456b74f7ef1f 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
@@ -21,7 +21,6 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
 ; CHECK-NEXT:  .LBB0_1: ; %Flow
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
-; CHECK-NEXT:    v_add_nc_u32_e32 v4, -4, v4
 ; CHECK-NEXT:  .LBB0_2: ; %Flow1
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
@@ -54,6 +53,7 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_add_nc_u32_e32 v4, s9, v2
 ; CHECK-NEXT:    v_cmp_ge_u32_e64 s4, v4, v0
+; CHECK-NEXT:    v_add_nc_u32_e32 v4, -4, v4
 ; CHECK-NEXT:    s_or_b32 s8, s4, s8
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_1

diff  --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir
index 037a285794120d..cc14b4a80d58a7 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir
@@ -42,6 +42,7 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.5(0x40000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
   ; CHECK-NEXT:   [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[SI_IF1]], [[SI_IF]], implicit-def dead $scc
   ; CHECK-NEXT:   SI_LOOP [[SI_IF_BREAK]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
@@ -51,7 +52,6 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.4
   ; CHECK-NEXT:   SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
   ; CHECK-NEXT:   INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[V_ADD_U32_e64_]]
   ; CHECK-NEXT:   S_BRANCH %bb.2
   ; CHECK-NEXT: {{  $}}

diff  --git a/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir b/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir
index 4feef2149b4224..ee3d7aeb454f96 100644
--- a/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir
+++ b/llvm/test/CodeGen/AMDGPU/sink-after-control-flow.mir
@@ -17,6 +17,7 @@ body:             |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 8
+  ; GFX10-NEXT:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[S_MOV_B32_]], [[DEF]], implicit $exec
   ; GFX10-NEXT:   [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[DEF]], 8, 5, implicit $exec
   ; GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 5
   ; GFX10-NEXT:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[V_BFE_U32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
@@ -37,7 +38,6 @@ body:             |
   ; GFX10-NEXT:   successors: %bb.3(0x40000000), %bb.4(0x40000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   $exec_lo = S_OR_B32 $exec_lo, [[S_XOR_B32_1]], implicit-def $scc
-  ; GFX10-NEXT:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[S_MOV_B32_]], [[DEF]], implicit $exec
   ; GFX10-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 31
   ; GFX10-NEXT:   [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[V_BFE_U32_e64_]], killed [[S_MOV_B32_2]], implicit $exec
   ; GFX10-NEXT:   [[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_1]], -1, implicit-def $scc