[llvm] [AMDGPU] Always emit SI_KILL_I1_PSEUDO for uniform floating point branches. (PR #124028)

Wed Jan 29 08:15:00 PST 2025

https://github.com/kmitropoulou updated https://github.com/llvm/llvm-project/pull/124028

>From b72520020c1261180bc847438f9eeea23bcf89b4 Mon Sep 17 00:00:00 2001
From: Konstantina Mitropoulou <KonstantinaMitropoulou at amd.com>
Date: Wed, 22 Jan 2025 14:30:06 -0800
Subject: [PATCH 1/3] [NFC] Use GCNPat instead of Pat.

---
 llvm/lib/Target/AMDGPU/SIInstructions.td | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index aa81d9b7e22a7e..8f4d74d4a2afbc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1053,39 +1053,39 @@ def : GCNPat<
   (SI_ELSE $src, $target)
 >;
 
-def : Pat <
+def : GCNPat <
   (int_amdgcn_kill i1:$src),
   (SI_KILL_I1_PSEUDO SCSrc_i1:$src, 0)
 >;
 
-def : Pat <
+def : GCNPat <
   (int_amdgcn_kill (i1 (not i1:$src))),
   (SI_KILL_I1_PSEUDO SCSrc_i1:$src, -1)
 >;
 
-def : Pat <
+def : GCNPat <
   (int_amdgcn_kill (i1 (setcc f32:$src, InlineImmFP32:$imm, cond:$cond))),
   (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
 >;
 
-def : Pat <
+def : GCNPat <
   (int_amdgcn_wqm_demote i1:$src),
   (SI_DEMOTE_I1 SCSrc_i1:$src, 0)
 >;
 
-def : Pat <
+def : GCNPat <
   (int_amdgcn_wqm_demote (i1 (not i1:$src))),
   (SI_DEMOTE_I1 SCSrc_i1:$src, -1)
 >;
 
   // TODO: we could add more variants for other types of conditionals
 
-def : Pat <
+def : GCNPat <
   (i64 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
   (COPY $src) // Return the SGPRs representing i1 src
 >;
 
-def : Pat <
+def : GCNPat <
   (i32 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
   (COPY $src) // Return the SGPRs representing i1 src
 >;

>From 3f19ceb3bcd87289d01c40e90c578bb6195fbd12 Mon Sep 17 00:00:00 2001
From: Konstantina Mitropoulou <KonstantinaMitropoulou at amd.com>
Date: Thu, 23 Jan 2025 19:46:36 -0800
Subject: [PATCH 2/3] Add a new test with SI_KILL_F32_COND_IMM_PSEUDO

---
 ...t_kill_i1_for_floation_point_comparison.ll | 39 +++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll

diff --git a/llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll b/llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll
new file mode 100644
index 00000000000000..84afbde2877f56
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel < %s 2>&1 | FileCheck %s
+
+define amdgpu_ps void @_amdgpu_ps_main() {
+  ; CHECK-LABEL: name: _amdgpu_ps_main
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+  ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM killed [[REG_SEQUENCE]], 0, 0 :: (dereferenceable invariant load (s32))
+  ; CHECK-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   nofpexcept S_CMP_LT_F32 [[S_BUFFER_LOAD_DWORD_IMM]], killed [[S_MOV_B32_1]], implicit-def $scc, implicit $mode
+  ; CHECK-NEXT:   SI_KILL_F32_COND_IMM_PSEUDO [[S_BUFFER_LOAD_DWORD_IMM]], 0, 11, implicit-def dead $vcc, implicit $exec
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit $scc
+  ; CHECK-NEXT:   S_BRANCH %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1.bb1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.bb2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+entry:
+  %i = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> zeroinitializer, i32 0, i32 0)
+  %i1 = bitcast i32 %i to float
+  %i2 = fcmp uge float %i1, 0.000000e+00
+  call void @llvm.amdgcn.kill(i1 %i2)
+  br i1 %i2, label %bb1, label %bb2
+
+bb1:                                               ; preds = %entry
+  %i3 = call i64 @llvm.amdgcn.s.getpc()
+  %i4 = and i64 %i3, 1
+  %i5 = inttoptr i64 %i4 to ptr addrspace(4)
+  %i6 = getelementptr i8, ptr addrspace(4) %i5, i64 32
+  br label %bb2
+
+bb2:                                              ; preds = %bb1, %entry
+  ret void
+}

>From 26ef9ed0870b218b3020269633da3fd1cc84bbee Mon Sep 17 00:00:00 2001
From: Konstantina Mitropoulou <KonstantinaMitropoulou at amd.com>
Date: Wed, 22 Jan 2025 15:13:00 -0800
Subject: [PATCH 3/3] [AMDGPU] Always emit SI_KILL_I1_PSEUDO for uniform
 floating point branches.

---
 llvm/lib/Target/AMDGPU/AMDGPU.td                            | 3 +++
 llvm/lib/Target/AMDGPU/SIInstructions.td                    | 1 +
 .../AMDGPU/set_kill_i1_for_floation_point_comparison.ll     | 6 ++++--
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 7ad6720b8001af..6439149d801f6b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -2498,6 +2498,9 @@ def HasNotMADIntraFwdBug : Predicate<"!Subtarget->hasMADIntraFwdBug()">;
 def HasSALUFloatInsts : Predicate<"Subtarget->hasSALUFloatInsts()">,
   AssemblerPredicate<(all_of FeatureSALUFloatInsts)>;
 
+def NotHasSALUFloatInsts : Predicate<"!Subtarget->hasSALUFloatInsts()">,
+  AssemblerPredicate<(all_of (not FeatureSALUFloatInsts))>;
+
 def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">,
   AssemblerPredicate<(all_of FeaturePseudoScalarTrans)>;
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 8f4d74d4a2afbc..5af46989aca97b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1063,6 +1063,7 @@ def : GCNPat <
   (SI_KILL_I1_PSEUDO SCSrc_i1:$src, -1)
 >;
 
+let SubtargetPredicate = NotHasSALUFloatInsts in
 def : GCNPat <
   (int_amdgcn_kill (i1 (setcc f32:$src, InlineImmFP32:$imm, cond:$cond))),
   (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
diff --git a/llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll b/llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll
index 84afbde2877f56..5f101c360f148c 100644
--- a/llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll
+++ b/llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll
@@ -10,8 +10,10 @@ define amdgpu_ps void @_amdgpu_ps_main() {
   ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM killed [[REG_SEQUENCE]], 0, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
-  ; CHECK-NEXT:   nofpexcept S_CMP_LT_F32 [[S_BUFFER_LOAD_DWORD_IMM]], killed [[S_MOV_B32_1]], implicit-def $scc, implicit $mode
-  ; CHECK-NEXT:   SI_KILL_F32_COND_IMM_PSEUDO [[S_BUFFER_LOAD_DWORD_IMM]], 0, 11, implicit-def dead $vcc, implicit $exec
+  ; CHECK-NEXT:   nofpexcept S_CMP_NLT_F32 [[S_BUFFER_LOAD_DWORD_IMM]], [[S_MOV_B32_1]], implicit-def $scc, implicit $mode
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
+  ; CHECK-NEXT:   SI_KILL_I1_PSEUDO killed [[COPY]], 0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   nofpexcept S_CMP_LT_F32 [[S_BUFFER_LOAD_DWORD_IMM]], [[S_MOV_B32_1]], implicit-def $scc, implicit $mode
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit $scc
   ; CHECK-NEXT:   S_BRANCH %bb.1
   ; CHECK-NEXT: {{  $}}