[llvm] [AMDGPU] Fix SIFixSGPRCopies handling of STRICT_WWM and friends (PR #142122)

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Fri May 30 03:59:54 PDT 2025


https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/142122

SIFixSGPRCopies handled STRICT_WWM (and similar WWM/WQM pseudos) like a
COPY. In particular, if the source was a VGPR and the result was an
SGPR, lowerVGPR2SGPRCopies would replace it with a readfirstlane,
erasing the original pseudo and hence sabotaging the WWM region marking
which is supposed to be performed by SIWholeQuadMode.

Fix this by handling these pseudos more like INSERT_SUBREG, PHI and
REG_SEQUENCE: if the source is a VGPR then move the result to a VGPR,
and keep the pseudo.


From f3971129f5d5957be46faad42c34b50716d002a1 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Fri, 30 May 2025 11:42:35 +0100
Subject: [PATCH 1/2] precommit tests

---
 .../CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll     | 38 +++++++++++++++++++
 .../CodeGen/AMDGPU/fix-sgpr-copies-wwm.mir    | 26 +++++++++++++
 2 files changed, 64 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.mir

diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
new file mode 100644
index 0000000000000..59a62e9900623
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
+
+define amdgpu_gs i32 @main() {
+; CHECK-LABEL: main:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_bitcmp1_b32 0, 0
+; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    s_cselect_b32 s1, -1, 0
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
+; CHECK-NEXT:    v_readfirstlane_b32 s1, v0
+; CHECK-NEXT:    s_or_b32 s0, s0, s1
+; CHECK-NEXT:    s_wait_alu 0xfffe
+; CHECK-NEXT:    s_bitcmp1_b32 s0, 0
+; CHECK-NEXT:    s_cselect_b32 s0, -1, 0
+; CHECK-NEXT:    s_wait_alu 0xfffe
+; CHECK-NEXT:    s_xor_b32 s0, s0, -1
+; CHECK-NEXT:    s_wait_alu 0xfffe
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-NEXT:    s_wait_alu 0xf1ff
+; CHECK-NEXT:    ; return to shader part epilog
+bb:
+  %i = call i1 @llvm.amdgcn.readfirstlane.i1(i1 false)
+  br label %bb1
+
+bb1:
+  %i2 = zext i1 %i to i32
+  %i3 = call i32 @llvm.amdgcn.wwm.i32(i32 0)
+  %i4 = call i32 @llvm.amdgcn.wwm.i32(i32 %i2)
+  %i5 = trunc i32 %i4 to i1
+  %i6 = trunc i32 %i3 to i1
+  %i7 = or i1 %i6, %i5
+  %i8 = select i1 %i7, i32 0, i32 1
+  ret i32 %i8
+}
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.mir
new file mode 100644
index 0000000000000..b4e407bbcc158
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.mir
@@ -0,0 +1,26 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=si-fix-sgpr-copies %s | FileCheck %s
+
+---
+name: main
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: main
+    ; CHECK: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[DEF1]], implicit $exec
+    ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed undef [[DEF]], killed undef [[V_READFIRSTLANE_B32_]], implicit-def dead $scc
+    ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed undef [[S_OR_B32_]], implicit-def dead $scc
+    ; CHECK-NEXT: S_CMP_EQ_U32 killed undef [[S_AND_B32_]], 1, implicit-def $scc
+    ; CHECK-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32 killed undef [[S_AND_B32_]], killed undef [[S_AND_B32_]], implicit-def dead $scc
+    ; CHECK-NEXT: SI_RETURN_TO_EPILOG undef $sgpr0
+    %0:sreg_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    early-clobber %2:sreg_32 = STRICT_WWM killed undef %1, implicit $exec
+    %3:sreg_32 = S_OR_B32 killed undef %0, killed undef %2, implicit-def dead $scc
+    %4:sreg_32 = S_AND_B32 1, killed undef %3, implicit-def dead $scc
+    S_CMP_EQ_U32 killed undef %4, 1, implicit-def $scc
+    %5:sreg_32_xm0_xexec = S_XOR_B32 killed undef %4, killed undef %4, implicit-def dead $scc
+    SI_RETURN_TO_EPILOG undef $sgpr0
+...

From 9df6331b9f3cd924b93fea845a50f03932506dff Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Fri, 30 May 2025 11:41:57 +0100
Subject: [PATCH 2/2] [AMDGPU] Fix SIFixSGPRCopies handling of STRICT_WWM and
 friends

SIFixSGPRCopies handled STRICT_WWM (and similar WWM/WQM pseudos) like a
COPY. In particular, if the source was a VGPR and the result was an
SGPR, lowerVGPR2SGPRCopies would replace it with a readfirstlane,
erasing the original pseudo and hence sabotaging the WWM region marking
which is supposed to be performed by SIWholeQuadMode.

Fix this by handling it more like INSERT_SUBREG, PHI and REG_SEQUENCE:
if the source is a VGPR then move the result to a VGPR, and keep the
pseudo.
---
 llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp       | 10 +++++-----
 llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll  |  8 +++++---
 llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.mir |  5 +++--
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 2cf00b4e5cc66..1bf5b4a241780 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -634,11 +634,7 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
       switch (MI.getOpcode()) {
       default:
         continue;
-      case AMDGPU::COPY:
-      case AMDGPU::WQM:
-      case AMDGPU::STRICT_WQM:
-      case AMDGPU::SOFT_WQM:
-      case AMDGPU::STRICT_WWM: {
+      case AMDGPU::COPY: {
         const TargetRegisterClass *SrcRC, *DstRC;
         std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);
 
@@ -662,6 +658,10 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
 
         break;
       }
+      case AMDGPU::WQM:
+      case AMDGPU::STRICT_WQM:
+      case AMDGPU::SOFT_WQM:
+      case AMDGPU::STRICT_WWM:
       case AMDGPU::INSERT_SUBREG:
       case AMDGPU::PHI:
       case AMDGPU::REG_SEQUENCE: {
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
index 59a62e9900623..db32135939a5d 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
@@ -7,9 +7,11 @@ define amdgpu_gs i32 @main() {
 ; CHECK-NEXT:    s_bitcmp1_b32 0, 0
 ; CHECK-NEXT:    s_mov_b32 s0, 0
 ; CHECK-NEXT:    s_cselect_b32 s1, -1, 0
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT:    s_or_saveexec_b32 s2, -1
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; CHECK-NEXT:    v_readfirstlane_b32 s1, v0
+; CHECK-NEXT:    s_mov_b32 exec_lo, s2
 ; CHECK-NEXT:    s_or_b32 s0, s0, s1
 ; CHECK-NEXT:    s_wait_alu 0xfffe
 ; CHECK-NEXT:    s_bitcmp1_b32 s0, 0
@@ -17,9 +19,9 @@ define amdgpu_gs i32 @main() {
 ; CHECK-NEXT:    s_wait_alu 0xfffe
 ; CHECK-NEXT:    s_xor_b32 s0, s0, -1
 ; CHECK-NEXT:    s_wait_alu 0xfffe
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-NEXT:    v_readfirstlane_b32 s0, v1
 ; CHECK-NEXT:    s_wait_alu 0xf1ff
 ; CHECK-NEXT:    ; return to shader part epilog
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.mir
index b4e407bbcc158..7c0d5ed205ec0 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.mir
@@ -9,8 +9,9 @@ body: |
     ; CHECK-LABEL: name: main
     ; CHECK: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
     ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[DEF1]], implicit $exec
-    ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed undef [[DEF]], killed undef [[V_READFIRSTLANE_B32_]], implicit-def dead $scc
+    ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[DEF1]], implicit $exec
+    ; CHECK-NEXT: early-clobber %2:sreg_32 = STRICT_WWM killed undef [[V_READFIRSTLANE_B32_]], implicit $exec
+    ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed undef [[DEF]], killed undef %2, implicit-def dead $scc
     ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed undef [[S_OR_B32_]], implicit-def dead $scc
     ; CHECK-NEXT: S_CMP_EQ_U32 killed undef [[S_AND_B32_]], 1, implicit-def $scc
     ; CHECK-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32 killed undef [[S_AND_B32_]], killed undef [[S_AND_B32_]], implicit-def dead $scc



More information about the llvm-commits mailing list