[llvm] [AMDGPU] Always emit SI_KILL_I1_PSEUDO for uniform floating point branches. (PR #124028)
Konstantina Mitropoulou via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 24 12:25:33 PST 2025
https://github.com/kmitropoulou updated https://github.com/llvm/llvm-project/pull/124028
>From a20f999cbc2ce115355983a806ba5f65adfcf09d Mon Sep 17 00:00:00 2001
From: Konstantina Mitropoulou <KonstantinaMitropoulou at amd.com>
Date: Wed, 22 Jan 2025 14:30:06 -0800
Subject: [PATCH 1/3] [NFC] Use GCNPat instead of Pat.
---
llvm/lib/Target/AMDGPU/SIInstructions.td | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 40a20fa9cb15ea..be3adae7dc905e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1053,39 +1053,39 @@ def : GCNPat<
(SI_ELSE $src, $target)
>;
-def : Pat <
+def : GCNPat <
(int_amdgcn_kill i1:$src),
(SI_KILL_I1_PSEUDO SCSrc_i1:$src, 0)
>;
-def : Pat <
+def : GCNPat <
(int_amdgcn_kill (i1 (not i1:$src))),
(SI_KILL_I1_PSEUDO SCSrc_i1:$src, -1)
>;
-def : Pat <
+def : GCNPat <
(int_amdgcn_kill (i1 (setcc f32:$src, InlineImmFP32:$imm, cond:$cond))),
(SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
>;
-def : Pat <
+def : GCNPat <
(int_amdgcn_wqm_demote i1:$src),
(SI_DEMOTE_I1 SCSrc_i1:$src, 0)
>;
-def : Pat <
+def : GCNPat <
(int_amdgcn_wqm_demote (i1 (not i1:$src))),
(SI_DEMOTE_I1 SCSrc_i1:$src, -1)
>;
// TODO: we could add more variants for other types of conditionals
-def : Pat <
+def : GCNPat <
(i64 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
(COPY $src) // Return the SGPRs representing i1 src
>;
-def : Pat <
+def : GCNPat <
(i32 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
(COPY $src) // Return the SGPRs representing i1 src
>;
>From 9c8a6cd673dd4213db46dfe5570b5c9d47cff6e8 Mon Sep 17 00:00:00 2001
From: Konstantina Mitropoulou <KonstantinaMitropoulou at amd.com>
Date: Thu, 23 Jan 2025 19:46:36 -0800
Subject: [PATCH 2/3] Add a new test with SI_KILL_F32_COND_IMM_PSEUDO
---
...t_kill_i1_for_floation_point_comparison.ll | 55 +++++++++++++++++++
1 file changed, 55 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll
diff --git a/llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll b/llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll
new file mode 100644
index 00000000000000..a16bd16f94f4b6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn--amdpal"
+
+define amdgpu_ps void @_amdgpu_ps_main() {
+ ; CHECK-LABEL: name: _amdgpu_ps_main
+ ; CHECK: bb.0..entry:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM killed [[REG_SEQUENCE]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+ ; CHECK-NEXT: nofpexcept S_CMP_LT_F32 [[S_BUFFER_LOAD_DWORD_IMM]], killed [[S_MOV_B32_1]], implicit-def $scc, implicit $mode
+ ; CHECK-NEXT: SI_KILL_F32_COND_IMM_PSEUDO [[S_BUFFER_LOAD_DWORD_IMM]], 0, 11, implicit-def dead $vcc, implicit $exec
+ ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1 (%ir-block.3):
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2 (%ir-block.5):
+ ; CHECK-NEXT: S_ENDPGM 0
+.entry:
+ %0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> zeroinitializer, i32 0, i32 0)
+ %1 = bitcast i32 %0 to float
+ %2 = fcmp uge float %1, 0.000000e+00
+ call void @llvm.amdgcn.kill(i1 %2)
+ br i1 %2, label %3, label %8
+
+3: ; preds = %.entry
+ %4 = call i64 @llvm.amdgcn.s.getpc()
+ %5 = and i64 %4, 1
+ %6 = inttoptr i64 %5 to ptr addrspace(4)
+ %7 = getelementptr i8, ptr addrspace(4) %6, i64 32
+ br label %8
+
+8: ; preds = %3, %.entry
+ ret void
+}
+
+; Function Attrs: nocallback nofree nounwind
+declare void @llvm.amdgcn.kill(i1) #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i64 @llvm.amdgcn.s.getpc() #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg) #2
+
+attributes #0 = { nocallback nofree nounwind }
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) }
>From 2ae284a396ef9a853ad4d3ead7328eab881234a8 Mon Sep 17 00:00:00 2001
From: Konstantina Mitropoulou <KonstantinaMitropoulou at amd.com>
Date: Wed, 22 Jan 2025 15:13:00 -0800
Subject: [PATCH 3/3] [AMDGPU] Always emit SI_KILL_I1_PSEUDO for uniform
floating point branches.
---
llvm/lib/Target/AMDGPU/AMDGPU.td | 3 +++
llvm/lib/Target/AMDGPU/SIInstructions.td | 1 +
.../AMDGPU/set_kill_i1_for_floation_point_comparison.ll | 7 +++++--
3 files changed, 9 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 7ad6720b8001af..6439149d801f6b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -2498,6 +2498,9 @@ def HasNotMADIntraFwdBug : Predicate<"!Subtarget->hasMADIntraFwdBug()">;
def HasSALUFloatInsts : Predicate<"Subtarget->hasSALUFloatInsts()">,
AssemblerPredicate<(all_of FeatureSALUFloatInsts)>;
+def NotHasSALUFloatInsts : Predicate<"!Subtarget->hasSALUFloatInsts()">,
+ AssemblerPredicate<(all_of (not FeatureSALUFloatInsts))>;
+
def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">,
AssemblerPredicate<(all_of FeaturePseudoScalarTrans)>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index be3adae7dc905e..ae3d20bcfb3aaf 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1063,6 +1063,7 @@ def : GCNPat <
(SI_KILL_I1_PSEUDO SCSrc_i1:$src, -1)
>;
+let SubtargetPredicate = NotHasSALUFloatInsts in
def : GCNPat <
(int_amdgcn_kill (i1 (setcc f32:$src, InlineImmFP32:$imm, cond:$cond))),
(SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
diff --git a/llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll b/llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll
index a16bd16f94f4b6..9de16bd064a4b6 100644
--- a/llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll
+++ b/llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll
@@ -13,8 +13,10 @@ define amdgpu_ps void @_amdgpu_ps_main() {
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM killed [[REG_SEQUENCE]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
- ; CHECK-NEXT: nofpexcept S_CMP_LT_F32 [[S_BUFFER_LOAD_DWORD_IMM]], killed [[S_MOV_B32_1]], implicit-def $scc, implicit $mode
- ; CHECK-NEXT: SI_KILL_F32_COND_IMM_PSEUDO [[S_BUFFER_LOAD_DWORD_IMM]], 0, 11, implicit-def dead $vcc, implicit $exec
+ ; CHECK-NEXT: nofpexcept S_CMP_NLT_F32 [[S_BUFFER_LOAD_DWORD_IMM]], [[S_MOV_B32_1]], implicit-def $scc, implicit $mode
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
+ ; CHECK-NEXT: SI_KILL_I1_PSEUDO killed [[COPY]], 0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: nofpexcept S_CMP_LT_F32 [[S_BUFFER_LOAD_DWORD_IMM]], [[S_MOV_B32_1]], implicit-def $scc, implicit $mode
; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
; CHECK-NEXT: S_BRANCH %bb.1
; CHECK-NEXT: {{ $}}
@@ -53,3 +55,4 @@ declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg) #2
attributes #0 = { nocallback nofree nounwind }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) }
+
More information about the llvm-commits
mailing list