[llvm] [AMDGPU] si-peephole-sdwa: Fix cndmask vcc use for wave32 (PR #139541)
Frederik Harwath via llvm-commits
llvm-commits at lists.llvm.org
Mon May 12 08:09:46 PDT 2025
https://github.com/frederik-h updated https://github.com/llvm/llvm-project/pull/139541
>From d32f06051c3538c6bca47cf31d818626f66faa81 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Mon, 12 May 2025 05:39:17 -0400
Subject: [PATCH 1/5] [AMDGPU] Add test that demonstrates si-peephole-sdwa
failure on V_CNDMASK
---
.../AMDGPU/sdwa-peephole-cndmask-fail.ll | 51 +++++++++++++++++++
1 file changed, 51 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll
new file mode 100644
index 0000000000000..1f7706b8f16c3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll
@@ -0,0 +1,51 @@
+; RUN: llc %s -march=amdgcn -mcpu=gfx1030 -o - 2>&1 | FileCheck %s
+; XFAIL: *
+
+; V_CNDMASK_B32_e64 gets converted to V_CNDMASK_B32_e32, but the
+; expected conversion to SDWA does not occur. FIXME This leads to a
+; compilation error, because the use of $vcc in the resulting
+; instruction must be fixed to $vcc_lo for wave32. This only happens
+; after the full conversion to SDWA.
+
+
+; CHECK-NOT: {{.*}}V_CNDMASK_B32_e32{{.*}}$vcc
+; CHECK-NOT: {{.*}}Bad machine code: Virtual register defs don't dominate all uses
+
+; ModuleID = 'test.ll'
+source_filename = "test.ll"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-amd-amdhsa"
+
+define amdgpu_kernel void @quux(i32 %arg, i1 %arg1, i1 %arg2) #0 {
+bb:
+ br i1 %arg1, label %bb9, label %bb3
+
+bb3: ; preds = %bb
+ %call = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %mul = mul i32 %call, 5
+ %zext = zext i32 %mul to i64
+ %getelementptr = getelementptr i8, ptr addrspace(1) null, i64 %zext
+ %getelementptr4 = getelementptr i8, ptr addrspace(1) %getelementptr, i64 4
+ %load = load i8, ptr addrspace(1) %getelementptr4, align 1
+ %getelementptr5 = getelementptr i8, ptr addrspace(1) %getelementptr, i64 3
+ %load6 = load i8, ptr addrspace(1) %getelementptr5, align 1
+ %insertelement = insertelement <5 x i8> poison, i8 %load, i64 4
+ %select = select i1 %arg2, <5 x i8> %insertelement, <5 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 0>
+ %insertelement7 = insertelement <5 x i8> %select, i8 %load6, i64 0
+ %icmp = icmp ult i32 0, %arg
+ %select8 = select i1 %icmp, <5 x i8> zeroinitializer, <5 x i8> %insertelement7
+ %shufflevector = shufflevector <5 x i8> zeroinitializer, <5 x i8> %select8, <5 x i32> <i32 0, i32 1, i32 7, i32 8, i32 9>
+ br label %bb9
+
+bb9: ; preds = %bb3, %bb
+ %phi = phi <5 x i8> [ %shufflevector, %bb3 ], [ zeroinitializer, %bb ]
+ %extractelement = extractelement <5 x i8> %phi, i64 0
+ store i8 %extractelement, ptr addrspace(1) null, align 1
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { "target-cpu"="gfx1030" }
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx1030" }
>From 85e8efa7964ba13a4de88d17c1230186a055a5b0 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Mon, 12 May 2025 07:16:09 -0400
Subject: [PATCH 2/5] [AMDGPU] si-peephole-sdwa: Fix cndmask vcc use for wave32
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 1 +
llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll | 8 ++++----
2 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 8eb1d7253cd48..bd8baaaa3df20 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1105,6 +1105,7 @@ void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI,
.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
.setMIFlags(MI.getFlags());
+ TII->fixImplicitOperands(*Converted);
LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted);
(void)Converted;
MI.eraseFromParent();
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll
index 1f7706b8f16c3..9ab5a31b52441 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll
@@ -1,15 +1,15 @@
; RUN: llc %s -march=amdgcn -mcpu=gfx1030 -o - 2>&1 | FileCheck %s
-; XFAIL: *
-; V_CNDMASK_B32_e64 gets converted to V_CNDMASK_B32_e32, but the
-; expected conversion to SDWA does not occur. FIXME This leads to a
+; In this test, V_CNDMASK_B32_e64 gets converted to V_CNDMASK_B32_e32,
+; but the expected conversion to SDWA does not occur. This led to a
; compilation error, because the use of $vcc in the resulting
-; instruction must be fixed to $vcc_lo for wave32. This only happens
+; instruction must be fixed to $vcc_lo for wave32 which only happened
; after the full conversion to SDWA.
; CHECK-NOT: {{.*}}V_CNDMASK_B32_e32{{.*}}$vcc
; CHECK-NOT: {{.*}}Bad machine code: Virtual register defs don't dominate all uses
+; CHECK: {{.*}}v_cndmask_b32_e32{{.*}}vcc_lo
; ModuleID = 'test.ll'
source_filename = "test.ll"
>From dc740cea3c32fd9cfe99301f3535195beff04b76 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Mon, 12 May 2025 08:20:43 -0400
Subject: [PATCH 3/5] Clean up test
---
.../CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll | 15 ++-------------
1 file changed, 2 insertions(+), 13 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll
index 9ab5a31b52441..c3f1f1cf7950c 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll
@@ -11,12 +11,7 @@
; CHECK-NOT: {{.*}}Bad machine code: Virtual register defs don't dominate all uses
; CHECK: {{.*}}v_cndmask_b32_e32{{.*}}vcc_lo
-; ModuleID = 'test.ll'
-source_filename = "test.ll"
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
-target triple = "amdgcn-amd-amdhsa"
-
-define amdgpu_kernel void @quux(i32 %arg, i1 %arg1, i1 %arg2) #0 {
+define amdgpu_kernel void @quux(i32 %arg, i1 %arg1, i1 %arg2) {
bb:
br i1 %arg1, label %bb9, label %bb3
@@ -42,10 +37,4 @@ bb9: ; preds = %bb3, %bb
%extractelement = extractelement <5 x i8> %phi, i64 0
store i8 %extractelement, ptr addrspace(1) null, align 1
ret void
-}
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #1
-
-attributes #0 = { "target-cpu"="gfx1030" }
-attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx1030" }
+}
\ No newline at end of file
>From 6f2426344000352914625af58caa24f650a5b643 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <frederik at harwath.name>
Date: Mon, 12 May 2025 14:46:51 +0200
Subject: [PATCH 4/5] Update
llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll
Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
---
llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll
index c3f1f1cf7950c..e95778a1e1759 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll
@@ -37,4 +37,4 @@ bb9: ; preds = %bb3, %bb
%extractelement = extractelement <5 x i8> %phi, i64 0
store i8 %extractelement, ptr addrspace(1) null, align 1
ret void
-}
\ No newline at end of file
+}
>From 4b52ec535329ab21d10660d5e78ae1bfc75afbcd Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Mon, 12 May 2025 11:07:33 -0400
Subject: [PATCH 5/5] Add mir test
---
.../AMDGPU/sdwa-peephole-cndmask-wave32.mir | 89 +++++++++++++++++++
1 file changed, 89 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir
index 4b45c54a3b83d..34a2c8735a7cb 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir
@@ -230,3 +230,92 @@ body: |
$vgpr0 = COPY %3
SI_RETURN implicit $vgpr0
...
+
+---
+name: cndmask-not-converted
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: cndmask-not-converted
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $vgpr0, $sgpr8_sgpr9
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 0, 0
+ ; CHECK-NEXT: S_BITCMP1_B32 [[S_LOAD_DWORDX2_IMM]].sub1, 0, implicit-def $scc
+ ; CHECK-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 $exec_lo, [[S_CSELECT_B32_]], implicit-def dead $scc
+ ; CHECK-NEXT: $vcc_lo = COPY [[S_AND_B32_]]
+ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc_lo
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
+ ; CHECK-NEXT: [[V_MUL_U32_U24_e64_:%[0-9]+]]:vgpr_32 = V_MUL_U32_U24_e64 [[COPY1]](s32), 5, 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MUL_U32_U24_e64_]], %subreg.sub0, killed [[V_MOV_B32_e32_1]], %subreg.sub1
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[REG_SEQUENCE]], 3, 0, implicit $exec
+ ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[GLOBAL_LOAD_USHORT]], 255, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; CHECK-NEXT: [[V_AND_B32_sdwa:%[0-9]+]]:vgpr_32 = V_AND_B32_sdwa 0, [[V_MOV_B32_e32_2]], 0, [[GLOBAL_LOAD_USHORT]], 0, 6, 0, 6, 0, implicit $exec
+ ; CHECK-NEXT: S_CMP_EQ_U32 [[COPY2]].sub0, 0, implicit-def $scc
+ ; CHECK-NEXT: [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc
+ ; CHECK-NEXT: $vcc_lo = COPY [[S_CSELECT_B32_1]]
+ ; CHECK-NEXT: [[V_CNDMASK_B32_e32_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e32 0, killed [[V_AND_B32_sdwa]], implicit $vcc_lo, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 24, implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_MOV_B32_e32_3]], 0, [[V_CNDMASK_B32_e32_]], 0, 1, 0, 6, 6, implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_CNDMASK_B32_e32_]], implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 255, implicit $exec
+ ; CHECK-NEXT: [[V_AND_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_AND_B32_sdwa 0, [[V_CNDMASK_B32_e32_]], 0, [[V_MOV_B32_e32_4]], 0, 6, 0, 5, 6, implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_sdwa:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_AND_B32_sdwa1]], 0, [[V_LSHRREV_B32_sdwa]], 0, 5, 0, 6, 6, implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_OR_B32_sdwa]], %bb.1
+ ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B]], [[PHI]], 0, 0, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $vgpr0, $sgpr8_sgpr9
+
+ %0:sgpr_64(p4) = COPY $sgpr8_sgpr9
+ %1:vgpr_32(s32) = COPY $vgpr0
+ %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0(p4), 0, 0
+ S_BITCMP1_B32 %2.sub1, 0, implicit-def $scc
+ %3:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc
+ %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %5:sreg_32 = S_AND_B32 $exec_lo, %3, implicit-def dead $scc
+ $vcc_lo = COPY %5
+ S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ %6:sreg_64 = COPY %2
+ %7:vgpr_32 = V_MUL_U32_U24_e64 %1(s32), 5, 0, implicit $exec
+ %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %9:vreg_64 = REG_SEQUENCE %7, %subreg.sub0, killed %8, %subreg.sub1
+ %10:vgpr_32 = GLOBAL_LOAD_USHORT %9, 3, 0, implicit $exec
+ %11:vgpr_32 = V_AND_B32_e64 %10, 255, implicit $exec
+ %12:vgpr_32 = V_AND_B32_e64 65535, killed %11, implicit $exec
+ S_CMP_EQ_U32 %6.sub0, 0, implicit-def $scc
+ %13:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc
+ %14:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, killed %12, %13, implicit $exec
+ %15:vgpr_32 = V_LSHRREV_B32_e64 24, %14, implicit $exec
+ %16:vgpr_32 = V_LSHLREV_B16_e64 8, %15, implicit $exec
+ %17:vgpr_32 = V_LSHRREV_B32_e64 16, %14, implicit $exec
+ %18:vgpr_32 = V_AND_B32_e64 %17, 255, implicit $exec
+ %19:vgpr_32 = V_OR_B32_e64 killed %18, killed %16, implicit $exec
+ %20:vgpr_32 = V_LSHLREV_B32_e64 16, killed %19, implicit $exec
+
+ bb.2:
+ %21:vgpr_32 = PHI %4, %bb.0, %20, %bb.1
+ %22:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+ GLOBAL_STORE_BYTE killed %22, %21, 0, 0, implicit $exec
+ S_ENDPGM 0
+...
More information about the llvm-commits
mailing list