[llvm] [AMDGPU] si-peephole-sdwa: Disable V_CNDMASK_B32 conversion with sext (PR #140760)

Frederik Harwath via llvm-commits llvm-commits at lists.llvm.org
Sun May 25 23:38:58 PDT 2025


https://github.com/frederik-h updated https://github.com/llvm/llvm-project/pull/140760

>From a035402e9ff56cb3b13562d3e7775087e273b32c Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Tue, 20 May 2025 11:52:13 -0400
Subject: [PATCH 1/6] [AMDGPU] si-peephole-sdwa: Disable V_CNDMASK conversion
 with sext

The sext modifier on an operand of V_CNDMASK_B32_sdwa gets erroneously
turned into a neg modifier in the assembly output.

As a workaround, to avoid miscompilation, this patch disables the
conversion of V_CNDMASK_B32 to the SDWA form if any operand uses
an sext modifier.
---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp     |  7 +++
 .../AMDGPU/sdwa-peephole-cndmask-sext.ll      | 47 +++++++++++++++++++
 2 files changed, 54 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index bd8baaaa3df20..70d448e75eb1a 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -430,6 +430,13 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
   case AMDGPU::V_CVT_PK_F32_BF8_sdwa:
     // Does not support input modifiers: noabs, noneg, nosext.
     return false;
+  case AMDGPU::V_CNDMASK_B32_sdwa:
+    // FIXME SISrcMods uses the same bitmask for SEXT and NEG
+    // modifiers and hence each instruction can only support one type
+    // of modifier; SEXT gets turned into NEG for this instruction.
+    if (Sext)
+      return false;
+    break;
   }
 
   // Find operand in instruction that matches source operand and replace it with
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll
new file mode 100644
index 0000000000000..4a6c4ae5e6c02
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
+; XFAIL: *
+
+; FIXME The sext modifier is turned into a neg modifier in the asm output
+
+define void @widget(ptr addrspace(7) %arg, <1 x i1> %arg1, <1 x i1> %arg2) #0 {
+; CHECK-LABEL: widget:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    s_mov_b32 s1, s0
+; CHECK-NEXT:    s_mov_b32 s2, s0
+; CHECK-NEXT:    s_mov_b32 s3, s0
+; CHECK-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
+; CHECK-NEXT:    v_and_b32_e32 v1, 1, v5
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; CHECK-NEXT:  ; %bb.1: ; %cond.load
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    ds_read_b32 v1, v1
+; CHECK-NEXT:  ; %bb.2: ; %else
+; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
+; CHECK-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; CHECK-NEXT:    s_cbranch_execz .LBB0_4
+; CHECK-NEXT:  ; %bb.3: ; %cond.store
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cndmask_b32_sdwa v0, v2, sext(v0), vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_or_b32_e32 v0, v0, v1
+; CHECK-NEXT:    ds_write_b32 v2, v0
+; CHECK-NEXT:  .LBB0_4: ; %else1
+; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+bb:
+  %load = load <1 x i8>, ptr addrspace(7) null, align 1
+  %sext = sext <1 x i8> %load to <1 x i32>
+  %select = select <1 x i1> %arg1, <1 x i32> %sext, <1 x i32> zeroinitializer
+  %call = tail call <1 x i32> @llvm.masked.load.v1i32.p3(ptr addrspace(3) null, i32 1, <1 x i1> %arg1, <1 x i32> zeroinitializer)
+  %or = or <1 x i32> %select, %call
+  tail call void @llvm.masked.store.v1i32.p3(<1 x i32> %or, ptr addrspace(3) null, i32 1, <1 x i1> %arg1)
+  tail call void @llvm.amdgcn.s.waitcnt(i32 0)
+  ret void
+}

>From 6761e1b4bbf548eb4e1970e54f3ea1aa12582c5f Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Wed, 21 May 2025 02:15:12 -0400
Subject: [PATCH 2/6] Extend comment on SISrcMods and make FIXME more explicit

---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 70d448e75eb1a..1e305c2efc8a0 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -431,9 +431,17 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
     // Does not support input modifiers: noabs, noneg, nosext.
     return false;
   case AMDGPU::V_CNDMASK_B32_sdwa:
-    // FIXME SISrcMods uses the same bitmask for SEXT and NEG
-    // modifiers and hence each instruction can only support one type
-    // of modifier; SEXT gets turned into NEG for this instruction.
+    // SISrcMods uses the same bitmask for SEXT and NEG modifiers and
+    // hence the compiler can only support one type of modifier for
+    // each SDWA instruction.  For V_CNDMASK_B32_sdwa, this is NEG
+    // since its operands get printed using
+    // AMDGPUInstPrinter::printOperandAndFPInputMods which produces
+    // the output intended for NEG if SEXT is set.
+    //
+    // The ISA does actually support both modifiers on most SDWA
+    // instructions.
+    //
+    // FIXME Accept SEXT here after fixing this issue.
     if (Sext)
       return false;
     break;

>From e3d9f8273d8e2fc3ac163078a7b590b73dc71bd4 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Wed, 21 May 2025 03:15:15 -0400
Subject: [PATCH 3/6] Simplify test

---
 .../AMDGPU/sdwa-peephole-cndmask-sext.ll      | 46 ++++---------------
 1 file changed, 10 insertions(+), 36 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll
index 4a6c4ae5e6c02..47dc0335e6f05 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll
@@ -1,47 +1,21 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
 ; XFAIL: *
 
 ; FIXME The sext modifier is turned into a neg modifier in the asm output
 
-define void @widget(ptr addrspace(7) %arg, <1 x i1> %arg1, <1 x i1> %arg2) #0 {
-; CHECK-LABEL: widget:
-; CHECK:       ; %bb.0: ; %bb
+define i32 @test_select_on_sext_sdwa(i8 %x, i32 %y, i1 %cond)  {
+; CHECK-LABEL: test_select_on_sext_sdwa:
+; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s0, 0
-; CHECK-NEXT:    s_mov_b32 s1, s0
-; CHECK-NEXT:    s_mov_b32 s2, s0
-; CHECK-NEXT:    s_mov_b32 s3, s0
-; CHECK-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
-; CHECK-NEXT:    v_and_b32_e32 v1, 1, v5
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; CHECK-NEXT:  ; %bb.1: ; %cond.load
-; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:    ds_read_b32 v1, v1
-; CHECK-NEXT:  ; %bb.2: ; %else
-; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
-; CHECK-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; CHECK-NEXT:    s_cbranch_execz .LBB0_4
-; CHECK-NEXT:  ; %bb.3: ; %cond.store
+; CHECK-NEXT:    v_and_b32_e32 v2, 1, v2
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    v_cndmask_b32_sdwa v0, v2, sext(v0), vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_or_b32_e32 v0, v0, v1
-; CHECK-NEXT:    ds_write_b32 v2, v0
-; CHECK-NEXT:  .LBB0_4: ; %else1
-; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
-bb:
-  %load = load <1 x i8>, ptr addrspace(7) null, align 1
-  %sext = sext <1 x i8> %load to <1 x i32>
-  %select = select <1 x i1> %arg1, <1 x i32> %sext, <1 x i32> zeroinitializer
-  %call = tail call <1 x i32> @llvm.masked.load.v1i32.p3(ptr addrspace(3) null, i32 1, <1 x i1> %arg1, <1 x i32> zeroinitializer)
-  %or = or <1 x i32> %select, %call
-  tail call void @llvm.masked.store.v1i32.p3(<1 x i32> %or, ptr addrspace(3) null, i32 1, <1 x i1> %arg1)
-  tail call void @llvm.amdgcn.s.waitcnt(i32 0)
-  ret void
+  %sext = sext i8 %x to i32
+  %select = select i1 %cond, i32 %sext, i32 zeroinitializer
+  %or = or i32 %select, %y
+  ret i32 %or
 }

>From f6d7513b483be8a9c3f545e2baef25224db70477 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <frederik at harwath.name>
Date: Wed, 21 May 2025 15:40:56 +0200
Subject: [PATCH 4/6] Update
 llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll

Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
---
 llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll
index 47dc0335e6f05..1d61dad704aec 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll
@@ -15,7 +15,7 @@ define i32 @test_select_on_sext_sdwa(i8 %x, i32 %y, i1 %cond)  {
 ; CHECK-NEXT:    v_or_b32_e32 v0, v0, v1
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %sext = sext i8 %x to i32
-  %select = select i1 %cond, i32 %sext, i32 zeroinitializer
+  %select = select i1 %cond, i32 %sext, i32 0
   %or = or i32 %select, %y
   ret i32 %or
 }

>From f7b8411d4bf3ba17edf3dca18bc41fc5c177200f Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Wed, 21 May 2025 10:01:32 -0400
Subject: [PATCH 5/6] Move test case into existing test file

---
 .../AMDGPU/sdwa-peephole-cndmask-sext.ll      | 21 ----------
 .../AMDGPU/sdwa-peephole-cndmask-wave64.mir   | 38 +++++++++++++++++++
 2 files changed, 38 insertions(+), 21 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll

diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll
deleted file mode 100644
index 1d61dad704aec..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll
+++ /dev/null
@@ -1,21 +0,0 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
-; XFAIL: *
-
-; FIXME The sext modifier is turned into a neg modifier in the asm output
-
-define i32 @test_select_on_sext_sdwa(i8 %x, i32 %y, i1 %cond)  {
-; CHECK-LABEL: test_select_on_sext_sdwa:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_and_b32_e32 v2, 1, v2
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    v_cndmask_b32_sdwa v0, v2, sext(v0), vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; CHECK-NEXT:    v_or_b32_e32 v0, v0, v1
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
-  %sext = sext i8 %x to i32
-  %select = select i1 %cond, i32 %sext, i32 0
-  %or = or i32 %select, %y
-  ret i32 %or
-}
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave64.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave64.mir
index e243df4077ff4..52c06952aa9fd 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave64.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave64.mir
@@ -231,3 +231,41 @@ body:             |
     $vgpr0 = COPY %3
     SI_RETURN implicit $vgpr0
 ...
+
+# SDWA conversion of V_CNDMASK_B32 with V_BFE_I32 operand had to be
+# disabled.
+# FIXME sext modifier gets erroneously printed as neg modifier.
+
+...
+---
+name: issue138766_cndmask_with_sext
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: issue138766_cndmask_with_sext
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY]], implicit $exec
+    ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 killed [[V_AND_B32_e64_]], 1, implicit $exec
+    ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[COPY2]], 0, 8, implicit $exec
+    ; CHECK-NEXT: $vcc = COPY killed [[V_CMP_EQ_U32_e64_]]
+    ; CHECK-NEXT: [[V_CNDMASK_B32_e32_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e32 0, killed [[V_BFE_I32_e64_]], implicit $vcc, implicit $exec
+    ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 killed [[V_CNDMASK_B32_e32_]], [[COPY1]], implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+    %10:vgpr_32 = COPY $vgpr2
+    %9:vgpr_32 = COPY $vgpr1
+    %8:vgpr_32 = COPY $vgpr0
+    %11:vgpr_32 = V_AND_B32_e64 1, %10, implicit $exec
+    %12:sreg_64_xexec = V_CMP_EQ_U32_e64 killed %11, 1, implicit $exec
+    %14:vgpr_32 = V_BFE_I32_e64 %8, 0, 8, implicit $exec
+    %16:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, killed %14, killed %12, implicit $exec
+    %18:vgpr_32 = V_OR_B32_e64 killed %16, %9, implicit $exec
+    $vgpr0 = COPY %18
+    SI_RETURN implicit $vgpr0
+...

>From 0a887bfa26c7362b97b93f604bb3f55d6d8b1d59 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Mon, 26 May 2025 06:34:54 +0000
Subject: [PATCH 6/6] Reintroduce IR test

---
 .../AMDGPU/sdwa-peephole-cndmask-sext.ll      | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll

diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll
new file mode 100644
index 0000000000000..2c7819a395c86
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
+
+; FIXME The sext modifier is turned into a neg modifier in the asm output
+
+define i32 @test_select_on_sext_sdwa(i8 %x, i32 %y, i1 %cond)  {
+; CHECK-LABEL: test_select_on_sext_sdwa:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_and_b32_e32 v2, 1, v2
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; CHECK-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; CHECK-NEXT:    v_or_b32_e32 v0, v0, v1
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %sext = sext i8 %x to i32
+  %select = select i1 %cond, i32 %sext, i32 0
+  %or = or i32 %select, %y
+  ret i32 %or
+}



More information about the llvm-commits mailing list