[llvm] [AMDGPU] si-peephole-sdwa: Fix cndmask vcc use for wave32 (PR #139541)

Frederik Harwath via llvm-commits llvm-commits at lists.llvm.org
Tue May 13 03:15:29 PDT 2025


================
@@ -230,3 +230,92 @@ body:             |
     $vgpr0 = COPY %3
     SI_RETURN implicit $vgpr0
 ...
+
+---
+name: cndmask-not-converted
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: cndmask-not-converted
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr8_sgpr9
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr8_sgpr9
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 0, 0
+  ; CHECK-NEXT:   S_BITCMP1_B32 [[S_LOAD_DWORDX2_IMM]].sub1, 0, implicit-def $scc
+  ; CHECK-NEXT:   [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 $exec_lo, [[S_CSELECT_B32_]], implicit-def dead $scc
+  ; CHECK-NEXT:   $vcc_lo = COPY [[S_AND_B32_]]
+  ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.2, implicit $vcc_lo
+  ; CHECK-NEXT:   S_BRANCH %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
+  ; CHECK-NEXT:   [[V_MUL_U32_U24_e64_:%[0-9]+]]:vgpr_32 = V_MUL_U32_U24_e64 [[COPY1]](s32), 5, 0, implicit $exec
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MUL_U32_U24_e64_]], %subreg.sub0, killed [[V_MOV_B32_e32_1]], %subreg.sub1
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[REG_SEQUENCE]], 3, 0, implicit $exec
+  ; CHECK-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[GLOBAL_LOAD_USHORT]], 255, implicit $exec
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+  ; CHECK-NEXT:   [[V_AND_B32_sdwa:%[0-9]+]]:vgpr_32 = V_AND_B32_sdwa 0, [[V_MOV_B32_e32_2]], 0, [[GLOBAL_LOAD_USHORT]], 0, 6, 0, 6, 0, implicit $exec
+  ; CHECK-NEXT:   S_CMP_EQ_U32 [[COPY2]].sub0, 0, implicit-def $scc
+  ; CHECK-NEXT:   [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc
+  ; CHECK-NEXT:   $vcc_lo = COPY [[S_CSELECT_B32_1]]
+  ; CHECK-NEXT:   [[V_CNDMASK_B32_e32_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e32 0, killed [[V_AND_B32_sdwa]], implicit $vcc_lo, implicit $exec
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 24, implicit $exec
+  ; CHECK-NEXT:   [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_MOV_B32_e32_3]], 0, [[V_CNDMASK_B32_e32_]], 0, 1, 0, 6, 6, implicit $exec
+  ; CHECK-NEXT:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_CNDMASK_B32_e32_]], implicit $exec
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 255, implicit $exec
+  ; CHECK-NEXT:   [[V_AND_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_AND_B32_sdwa 0, [[V_CNDMASK_B32_e32_]], 0, [[V_MOV_B32_e32_4]], 0, 6, 0, 5, 6, implicit $exec
+  ; CHECK-NEXT:   [[V_OR_B32_sdwa:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_AND_B32_sdwa1]], 0, [[V_LSHRREV_B32_sdwa]], 0, 5, 0, 6, 6, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_OR_B32_sdwa]], %bb.1
+  ; CHECK-NEXT:   [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+  ; CHECK-NEXT:   GLOBAL_STORE_BYTE killed [[V_MOV_B]], [[PHI]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    liveins: $vgpr0, $sgpr8_sgpr9
+
+    %0:sgpr_64(p4) = COPY $sgpr8_sgpr9
+    %1:vgpr_32(s32) = COPY $vgpr0
+    %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0(p4), 0, 0
+    S_BITCMP1_B32 %2.sub1, 0, implicit-def $scc
+    %3:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc
+    %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %5:sreg_32 = S_AND_B32 $exec_lo, %3, implicit-def dead $scc
+    $vcc_lo = COPY %5
+    S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2(0x80000000)
+
+    %6:sreg_64 = COPY %2
+    %7:vgpr_32 = V_MUL_U32_U24_e64 %1(s32), 5, 0, implicit $exec
+    %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %9:vreg_64 = REG_SEQUENCE %7, %subreg.sub0, killed %8, %subreg.sub1
+    %10:vgpr_32 = GLOBAL_LOAD_USHORT %9, 3, 0, implicit $exec
+    %11:vgpr_32 = V_AND_B32_e64 %10, 255, implicit $exec
+    %12:vgpr_32 = V_AND_B32_e64 65535, killed %11, implicit $exec
+    S_CMP_EQ_U32 %6.sub0, 0, implicit-def $scc
+    %13:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc
+    %14:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, killed %12, %13, implicit $exec
----------------
frederik-h wrote:

I wasn't able to reduce the test case further. In particular, removing any of the branching makes the error disappear.
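
For reference, the wave32 shape that the test exercises can be distilled from the input/CHECK pair above (a simplified sketch, not an excerpt from the patch): the e64 cndmask with an explicit SGPR carry-in operand ends up as the e32 form, which reads the carry implicitly from $vcc_lo rather than the 64-bit $vcc:

    ; Before (input MIR): explicit carry-in operand %13 on the e64 form.
    %14:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, killed %12, %13, implicit $exec

    ; After (per the CHECK lines, operands simplified): the carry-in is
    ; copied into $vcc_lo and the e32 form reads it as an implicit use.
    $vcc_lo = COPY %13
    %14:vgpr_32 = V_CNDMASK_B32_e32 0, killed %12, implicit $vcc_lo, implicit $exec

Per the PR title, the fix concerns making this implicit VCC use correct for wave32 ($vcc_lo instead of $vcc).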

https://github.com/llvm/llvm-project/pull/139541

