[llvm] 1b83f10 - [AMDGPU] Fix to prevent sinking of PERMLANE_SWAP instruction (#144423)

Fri Jun 20 04:44:26 PDT 2025

Author: Vigneshwar Jayakumar
Date: 2025-06-20T20:44:23+09:00
New Revision: 1b83f10072b322a206ffcaf737b42fe5c2d95b89

URL: https://github.com/llvm/llvm-project/commit/1b83f10072b322a206ffcaf737b42fe5c2d95b89
DIFF: https://github.com/llvm/llvm-project/commit/1b83f10072b322a206ffcaf737b42fe5c2d95b89.diff

LOG: [AMDGPU] Fix to prevent sinking of PERMLANE_SWAP instruction (#144423)

Permlane_swap instruction depends on exec mask, added isConvergent flag
to prevent sinking of instruction.

Fixes: SWDEV-537232

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/VOP1Instructions.td
    llvm/lib/Target/AMDGPU/VOPInstructions.td
    llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 926df955881e0..02b912bcfb9e0 100644

--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -774,7 +774,8 @@ defm V_PRNG_B32 : VOP1Inst <"v_prng_b32", VOP_I32_I32, int_amdgcn_prng_b32>;
 
 let Constraints = "$vdst = $vdst_in, $src0_out = $src0",
      DisableEncoding="$vdst_in,$src0_out",
-     SchedRW = [Write32Bit, Write32Bit] in {
+     SchedRW = [Write32Bit, Write32Bit],
+     isConvergent = 1 in {
 let SubtargetPredicate = HasPermlane16Swap in {
 defm V_PERMLANE16_SWAP_B32 : VOP1Inst<"v_permlane16_swap_b32", VOP_PERMLANE_SWAP>;
 }
@@ -1549,8 +1550,11 @@ defm V_CVT_PK_F32_FP8    : VOP1_Real_NoDstSel_SDWA_gfx9<0x56>;
 defm V_CVT_PK_F32_BF8    : VOP1_Real_NoDstSel_SDWA_gfx9<0x57>;
 
 defm V_PRNG_B32            : VOP1_Real_gfx9 <0x58>;
+
+let isConvergent = 1 in {
 defm V_PERMLANE16_SWAP_B32 : VOP1_OpSel_Real_e32e64_gfx9<0x059>;
 defm V_PERMLANE32_SWAP_B32 : VOP1_OpSel_Real_e32e64_gfx9<0x05a>;
+}
 
 class MovDPP8Pattern<Predicate Pred, Instruction Inst, ValueType vt> : GCNPat <
   (vt (int_amdgcn_mov_dpp8 vt:$src, timm:$dpp8)),

diff  --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 6045f59d1f040..19d490465f163 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -15,6 +15,7 @@ class LetDummies {
   bit isConvertibleToThreeAddress;
   bit isMoveImm;
   bit isReMaterializable;
+  bit isConvergent;
   bit isAsCheapAsAMove;
   bit FPDPRounding;
   Predicate SubtargetPredicate;

diff  --git a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
index 0fc31ea9d6437..ed22b353b0664 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
@@ -733,3 +733,70 @@ body:             |
     liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
     S_ENDPGM 0
 ...
+---
+name: test_no_sink_permlane_swap
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  ; GFX9-LABEL: name: test_no_sink_permlane_swap
+  ; GFX9: bb.0:
+  ; GFX9-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; GFX9-NEXT:   liveins: $vgpr0
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX9-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+  ; GFX9-NEXT:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY1]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+  ; GFX9-NEXT:   [[V_PERMLANE32_SWAP_B32_e64_:%[0-9]+]]:vgpr_32, [[V_PERMLANE32_SWAP_B32_e64_1:%[0-9]+]]:vgpr_32 = V_PERMLANE32_SWAP_B32_e64 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], 0, 0, implicit $exec
+  ; GFX9-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX9-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; GFX9-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY2]], [[S_MOV_B32_]], implicit $exec
+  ; GFX9-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-NEXT:   S_BRANCH %bb.1
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.1:
+  ; GFX9-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[V_MAX_I32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I32_e64 [[V_PERMLANE32_SWAP_B32_e64_]], [[V_PERMLANE32_SWAP_B32_e64_1]], implicit $exec
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.2:
+  ; GFX9-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_MAX_I32_e64_]], %bb.1
+  ; GFX9-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.3:
+  ; GFX9-NEXT:   S_ENDPGM 0, implicit [[PHI]]
+  bb.0:
+    successors: %bb.2(0x40000000), %bb.1(0x40000000)
+    liveins: $vgpr0
+
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %2:sreg_64 = S_MOV_B64 0
+    %3:vreg_64 = COPY %2
+    %4:vgpr_32 = GLOBAL_LOAD_DWORD killed %3, 0, 0, implicit $exec :: (load (s32), addrspace 1)
+    %5:vgpr_32, %6:vgpr_32 = V_PERMLANE32_SWAP_B32_e64 %4, %4, 0, 0, implicit $exec
+    %7:vgpr_32 = COPY $vgpr0
+    %8:sreg_32 = S_MOV_B32 1
+    %9:sreg_64 = V_CMP_LT_I32_e64 %7, %8, implicit $exec
+    %10:sreg_64 = COPY %9
+    %11:sreg_64 = SI_IF %10, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2(0x80000000)
+
+    %12:vgpr_32 = V_MAX_I32_e64 %5, %6, implicit $exec
+
+  bb.2:
+    successors: %bb.3(0x80000000)
+
+    %13:vgpr_32 = PHI %1, %bb.0, %12, %bb.1
+    SI_END_CF %11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+  bb.3:
+    S_ENDPGM 0, implicit %13
+...