[llvm] [AMDGCN][SIWholeQuadMode] Handle case when SI_KILL_I1_TERMINATOR -1,0 is not the only terminator (PR #122922)

Wed Jan 29 07:34:26 PST 2025

https://github.com/jmmartinez updated https://github.com/llvm/llvm-project/pull/122922

>From b725f61b33e8b9a4adb014fa9e559497b041530a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juamarti at amd.com>
Date: Tue, 14 Jan 2025 15:11:41 +0100
Subject: [PATCH] [AMDGCN][SIWholeQuadMode] Rework
 splitBlock/lowerKillI1/lowerKillF32 to handle case when SI_KILL_I1_TERMINATOR
 -1 0 is not the unique terminator

The lowerKillI1 method wrongly handled cases where it inserted a
new S_BRANCH instruction when the kill was not the only terminator,
and then tried to split the block.

`SI_KILL_I1_TERMINATOR -1,0` doesn't have any effect. Instead of
lowering to an unconditional branch, we remove the instruction and
insert an unconditional branch only if the instruction is the last
terminator. No split is needed in this case (if the last terminator
has been reached, then the whole block was processed).

Also stop generating an unconditional branch in splitBlock: this
branch was redundant since TermMI is promoted to a
terminator that fallsthrough to the next block already.

Solves SWDEV-508819
---
 llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp    | 37 ++++++++---------
 .../AMDGPU/kill-true-in-return-block.ll       | 41 +++++++++++++++++++
 llvm/test/CodeGen/AMDGPU/wqm.ll               |  4 +-
 3 files changed, 61 insertions(+), 21 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/kill-true-in-return-block.ll

diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 87eb6d9e385d46..bfa9ce82e2c192 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -768,11 +768,19 @@ void SIWholeQuadMode::splitBlock(MachineInstr *TermMI) {
   case AMDGPU::S_MOV_B64:
     NewOpcode = AMDGPU::S_MOV_B64_term;
     break;
-  default:
+  case AMDGPU::S_ANDN2_B32:
+    NewOpcode = AMDGPU::S_ANDN2_B32_term;
+    break;
+  case AMDGPU::S_ANDN2_B64:
+    NewOpcode = AMDGPU::S_ANDN2_B64_term;
     break;
+  default:
+    llvm_unreachable("Unexpected instruction");
   }
-  if (NewOpcode)
-    TermMI->setDesc(TII->get(NewOpcode));
+
+  // these terminators fallthrough to the next block, no need to add an
+  // unconditional branch to the next block (SplitBB)
+  TermMI->setDesc(TII->get(NewOpcode));
 
   if (SplitBB != BB) {
     // Update dominator trees
@@ -787,12 +795,6 @@ void SIWholeQuadMode::splitBlock(MachineInstr *TermMI) {
       MDT->applyUpdates(DTUpdates);
     if (PDT)
       PDT->applyUpdates(DTUpdates);
-
-    // Link blocks
-    MachineInstr *MI =
-        BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
-            .addMBB(SplitBB);
-    LIS->InsertMachineInstrInMaps(*MI);
   }
 }
 
@@ -901,8 +903,6 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &MI) {
       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
 
   assert(MBB.succ_size() == 1);
-  MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
-                              .addMBB(*MBB.succ_begin());
 
   // Update live intervals
   LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
@@ -911,9 +911,8 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &MI) {
   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
   LIS->InsertMachineInstrInMaps(*ExecMaskMI);
   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
-  LIS->InsertMachineInstrInMaps(*NewTerm);
 
-  return NewTerm;
+  return ExecMaskMI;
 }
 
 MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) {
@@ -940,17 +939,17 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) {
                          .addReg(Exec);
     } else {
       // Static: kill does nothing
-      MachineInstr *NewTerm = nullptr;
-      if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
+      bool IsLastTerminator = std::next(MI.getIterator()) == MBB.end();
+      if (!IsLastTerminator) {
         LIS->RemoveMachineInstrFromMaps(MI);
       } else {
-        assert(MBB.succ_size() == 1);
-        NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
-                      .addMBB(*MBB.succ_begin());
+        assert(MBB.succ_size() == 1 && MI.getOpcode() != AMDGPU::SI_DEMOTE_I1);
+        MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
+                                    .addMBB(*MBB.succ_begin());
         LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
       }
       MBB.remove(&MI);
-      return NewTerm;
+      return nullptr;
     }
   } else {
     if (!KillVal) {
diff --git a/llvm/test/CodeGen/AMDGPU/kill-true-in-return-block.ll b/llvm/test/CodeGen/AMDGPU/kill-true-in-return-block.ll
new file mode 100644
index 00000000000000..021c845d5ea6bb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/kill-true-in-return-block.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a %s -o - | FileCheck %s
+
+define amdgpu_ps float @kill_true(i1 %.not) {
+; CHECK-LABEL: kill_true:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_mov_b64 s[0:1], exec
+; CHECK-NEXT:    s_wqm_b64 exec, exec
+; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; CHECK-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; CHECK-NEXT:    s_and_saveexec_b64 s[2:3], s[4:5]
+; CHECK-NEXT:    s_cbranch_execz .LBB0_2
+; CHECK-NEXT:  ; %bb.1: ; %if1
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $exec
+; CHECK-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v2, s4
+; CHECK-NEXT:    flat_store_dword v[0:1], v2
+; CHECK-NEXT:  .LBB0_2: ; %endif1
+; CHECK-NEXT:    s_or_b64 exec, exec, s[2:3]
+; CHECK-NEXT:    s_and_b64 exec, exec, s[0:1]
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    ; return to shader part epilog
+entry:
+  br i1 %.not, label %endif1, label %if1
+
+if1:
+  %C = call float @llvm.amdgcn.wqm.f32(float 0.000000e+00)
+  store float %C, ptr null, align 4
+  br label %endif1
+
+endif1:
+  call void @llvm.amdgcn.kill(i1 true)
+  ret float 0.000000e+00
+}
+
+declare void @llvm.amdgcn.kill(i1)
+
+declare float @llvm.amdgcn.wqm.f32(float)
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index deab4075818805..e439cf80e49241 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -3361,7 +3361,7 @@ define amdgpu_ps void @test_for_deactivating_lanes_in_wave32(ptr addrspace(6) in
 ; GFX9-W64-NEXT:    s_buffer_load_dword s0, s[0:3], 0x0
 ; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-W64-NEXT:    v_cmp_le_f32_e64 vcc, s0, 0
-; GFX9-W64-NEXT:    s_andn2_b64 s[4:5], exec, vcc
+; GFX9-W64-NEXT:    s_andn2_b64 exec, exec, vcc
 ; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB55_1
 ; GFX9-W64-NEXT:    s_endpgm
 ; GFX9-W64-NEXT:  .LBB55_1:
@@ -3377,7 +3377,7 @@ define amdgpu_ps void @test_for_deactivating_lanes_in_wave32(ptr addrspace(6) in
 ; GFX10-W32-NEXT:    s_buffer_load_dword s0, s[0:3], 0x0
 ; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-W32-NEXT:    v_cmp_le_f32_e64 vcc_lo, s0, 0
-; GFX10-W32-NEXT:    s_andn2_b32 s4, exec_lo, vcc_lo
+; GFX10-W32-NEXT:    s_andn2_b32 exec_lo, exec_lo, vcc_lo
 ; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB55_1
 ; GFX10-W32-NEXT:    s_endpgm
 ; GFX10-W32-NEXT:  .LBB55_1: