[llvm] AMDGPU: fix isSafeToSink expecting exactly one predecessor (PR #89224)

Mon Apr 22 04:15:47 PDT 2024

https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/89224

>From 400f0eb0907799b022ca6ede2d7a94ef64fdc408 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Mon, 22 Apr 2024 13:14:42 +0200
Subject: [PATCH] AMDGPU: fix isSafeToSink expecting exactly one predecessor

isSafeToSink needs to check whether a machine cycle has a divergent exit
branch, but first it needs the MBB that contains the cycle exit branch.
A machine cycle has information about its exit blocks (the blocks that
are destinations of cycle exit branches).
After structurize-cfg, a cycle should have exactly one exit block, and
that exit block should have exactly one predecessor.
Early-tailduplication can delete the exit block created by
structurize-cfg, so there is still exactly one cycle exit block, but the
new cycle exit block can have multiple predecessors. We need the
predecessor that belongs to the cycle.
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        | 15 +--
 .../AMDGPU/GlobalISel/is-safe-to-sink-bug.ll  | 93 +++++++++++++++++++
 2 files changed, 102 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index f4b21b7dfac391..d7c5aba4141118 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -213,17 +213,20 @@ bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
       // Check if there is a FromCycle that contains SgprDef's basic block but
       // does not contain SuccToSinkTo and also has divergent exit condition.
       while (FromCycle && !FromCycle->contains(ToCycle)) {
-        // After structurize-cfg, there should be exactly one cycle exit.
+        // After structurize-cfg, there should be exactly one cycle exit. Also,
+        // cycle exit block should have exactly one predecessor, the cycle exit.
+        // Early-tailduplication can remove that block so we have to search for
+        // a predecessor that is in the cycle.
         SmallVector<MachineBasicBlock *, 1> ExitBlocks;
         FromCycle->getExitBlocks(ExitBlocks);
-        assert(ExitBlocks.size() == 1);
-        assert(ExitBlocks[0]->getSinglePredecessor());
 
         // FromCycle has divergent exit condition.
-        if (hasDivergentBranch(ExitBlocks[0]->getSinglePredecessor())) {
-          return false;
+        for (MachineBasicBlock *ExitBlock : ExitBlocks) {
+          for (MachineBasicBlock *Pred : ExitBlock->predecessors()) {
+            if (FromCycle->contains(Pred) && hasDivergentBranch(Pred))
+              return false;
+          }
         }
-
         FromCycle = FromCycle->getParentCycle();
       }
     }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll
new file mode 100644
index 00000000000000..d3bc661f5940b6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel -verify-machineinstrs < %s | FileCheck %s
+
+; early-tailduplication deletes the cycle exit block created by structurize-cfg
+; that had exactly one predecessor. Now the new cycle exit block has two
+; predecessors, so we need to find the predecessor that belongs to the cycle.
+
+define amdgpu_ps void @_amdgpu_ps_main(i1 %arg) {
+; CHECK-LABEL: _amdgpu_ps_main:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
+; CHECK-NEXT:    s_mov_b32 s5, s4
+; CHECK-NEXT:    s_mov_b32 s6, s4
+; CHECK-NEXT:    s_mov_b32 s7, s4
+; CHECK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; CHECK-NEXT:    s_buffer_load_dword s1, s[4:7], 0x0
+; CHECK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; CHECK-NEXT:    s_mov_b32 s10, -1
+; CHECK-NEXT:    s_mov_b32 s11, 0x31c16000
+; CHECK-NEXT:    s_add_u32 s8, s8, s0
+; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, v0
+; CHECK-NEXT:    s_addc_u32 s9, s9, 0
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_cmp_ge_i32 s1, 0
+; CHECK-NEXT:    s_cbranch_scc0 .LBB0_2
+; CHECK-NEXT:  .LBB0_1: ; %bb12
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s4
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[8:9]
+; CHECK-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; CHECK-NEXT:    s_swappc_b64 s[30:31], 0
+; CHECK-NEXT:  .LBB0_2: ; %bb2.preheader
+; CHECK-NEXT:    s_mov_b32 s1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s1
+; CHECK-NEXT:    s_branch .LBB0_4
+; CHECK-NEXT:    .p2align 6
+; CHECK-NEXT:  .LBB0_3: ; %bb6
+; CHECK-NEXT:    ; in Loop: Header=BB0_4 Depth=1
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; CHECK-NEXT:    s_and_b32 s2, 1, s2
+; CHECK-NEXT:    v_or_b32_e32 v1, 1, v0
+; CHECK-NEXT:    v_cmp_ne_u32_e64 s2, 0, s2
+; CHECK-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v0
+; CHECK-NEXT:    v_mov_b32_e32 v0, v1
+; CHECK-NEXT:    s_and_b32 s4, s2, s1
+; CHECK-NEXT:    s_andn2_b32 s1, s1, exec_lo
+; CHECK-NEXT:    s_and_b32 s2, exec_lo, s4
+; CHECK-NEXT:    s_or_b32 s1, s1, s2
+; CHECK-NEXT:    s_cbranch_vccz .LBB0_1
+; CHECK-NEXT:  .LBB0_4: ; %bb2
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_mov_b32 s2, 0
+; CHECK-NEXT:    s_and_saveexec_b32 s3, s0
+; CHECK-NEXT:    s_cbranch_execz .LBB0_3
+; CHECK-NEXT:  ; %bb.5: ; %bb5
+; CHECK-NEXT:    ; in Loop: Header=BB0_4 Depth=1
+; CHECK-NEXT:    s_mov_b32 s2, 1
+; CHECK-NEXT:    s_branch .LBB0_3
+bb:
+  %i = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> zeroinitializer, i32 0, i32 0)
+  %i1 = icmp slt i32 %i, 0
+  br i1 %i1, label %bb2, label %bb12
+
+bb2:
+  %i3 = phi i1 [ %i9, %bb6 ], [ false, %bb ]
+  %i4 = phi i32 [ %i10, %bb6 ], [ 0, %bb ]
+  br i1 %arg, label %bb5, label %bb6
+
+bb5:
+  br label %bb6
+
+bb6:
+  %i7 = phi i32 [ 0, %bb2 ], [ 1, %bb5 ]
+  %i8 = icmp ne i32 %i7, 0
+  %i9 = select i1 %i8, i1 %i3, i1 false
+  %i10 = or i32 %i4, 1
+  %i11 = icmp slt i32 %i4, 0
+  br i1 %i11, label %bb2, label %bb12
+
+bb12:
+  %i13 = phi i1 [ false, %bb ], [ %i9, %bb6 ]
+  %i14 = select i1 %i13, float 0.000000e+00, float 1.000000e+00
+  %i15 = insertelement <4 x float> zeroinitializer, float %i14, i64 0
+  call amdgpu_gfx addrspace(4) void null(<4 x float> %i15, i32 0)
+  unreachable
+}
+
+declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg)