[llvm] [AMDGPU] Fix to prevent sinking of PERMLANE_SWAP instruction (PR #144423)
Vigneshwar Jayakumar via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 16 12:47:22 PDT 2025
https://github.com/VigneshwarJ created https://github.com/llvm/llvm-project/pull/144423
Permlane_swap instruction depends on exec mask, added isConvergent flag to prevent sinking of instruction.
Fixes SWDEV-537232
>From 430b60ebe05f4215d1d5c54caa45b70157621634 Mon Sep 17 00:00:00 2001
From: vigneshwar jayakumar <vigneshwar.jayakumar at amd.com>
Date: Mon, 16 Jun 2025 14:29:16 -0500
Subject: [PATCH] [AMDGPU] Fix to prevent sinking of PERMLANE_SWAP instruction
Permlane_swap instruction depends on exec mask, added isConvergent flag
to prevent sinking of instruction.
Fixes SWDEV-537232
---
llvm/lib/Target/AMDGPU/VOP1Instructions.td | 6 +-
llvm/lib/Target/AMDGPU/VOPInstructions.td | 1 +
.../machine-sink-ignorable-exec-use.mir | 61 +++++++++++++++++++
3 files changed, 67 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 7fdd951ecbd3c..d6718c556db1e 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -775,7 +775,8 @@ defm V_PRNG_B32 : VOP1Inst <"v_prng_b32", VOP_I32_I32, int_amdgcn_prng_b32>;
let Constraints = "$vdst = $vdst_in, $src0_out = $src0",
DisableEncoding="$vdst_in,$src0_out",
- SchedRW = [Write32Bit, Write32Bit] in {
+ SchedRW = [Write32Bit, Write32Bit],
+ isConvergent = 1 in {
let SubtargetPredicate = HasPermlane16Swap in {
defm V_PERMLANE16_SWAP_B32 : VOP1Inst<"v_permlane16_swap_b32", VOP_PERMLANE_SWAP>;
}
@@ -1550,8 +1551,11 @@ defm V_CVT_PK_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x56>;
defm V_CVT_PK_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x57>;
defm V_PRNG_B32 : VOP1_Real_gfx9 <0x58>;
+
+let isConvergent = 1 in {
defm V_PERMLANE16_SWAP_B32 : VOP1_OpSel_Real_e32e64_gfx9<0x059>;
defm V_PERMLANE32_SWAP_B32 : VOP1_OpSel_Real_e32e64_gfx9<0x05a>;
+}
class MovDPP8Pattern<Predicate Pred, Instruction Inst, ValueType vt> : GCNPat <
(vt (int_amdgcn_mov_dpp8 vt:$src, timm:$dpp8)),
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 952ee2fe2c955..c0c874300ef0f 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -15,6 +15,7 @@ class LetDummies {
bit isConvertibleToThreeAddress;
bit isMoveImm;
bit isReMaterializable;
+ bit isConvergent;
bit isAsCheapAsAMove;
bit FPDPRounding;
Predicate SubtargetPredicate;
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
index 0fc31ea9d6437..3cdba54026ccc 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
@@ -733,3 +733,64 @@ body: |
liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
S_ENDPGM 0
...
+---
+name: test_no_sink_permlane_swap
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GFX9-LABEL: name: test_no_sink_permlane_swap
+ ; GFX9: bb.0:
+ ; GFX9-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GFX9-NEXT: liveins: $vgpr0
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
+ ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY1]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-NEXT: [[V_PERMLANE32_SWAP_B32_e64_:%[0-9]+]]:vgpr_32, [[V_PERMLANE32_SWAP_B32_e64_1:%[0-9]+]]:vgpr_32 = V_PERMLANE32_SWAP_B32_e64 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], 0, 0, implicit $exec
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX9-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY2]](s32), [[S_MOV_B32_]], implicit $exec
+ ; GFX9-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-NEXT: S_BRANCH %bb.1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: bb.1:
+ ; GFX9-NEXT: successors: %bb.2(0x80000000)
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[V_MAX_I32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I32_e64 [[V_PERMLANE32_SWAP_B32_e64_]], [[V_PERMLANE32_SWAP_B32_e64_1]], implicit $exec
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: bb.2:
+ ; GFX9-NEXT: successors: %bb.3(0x80000000)
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_MAX_I32_e64_]], %bb.1
+ ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: bb.3:
+ ; GFX9-NEXT: S_ENDPGM 0, implicit [[PHI]]
+ bb.0:
+ liveins: $vgpr0
+ %1:vgpr_32 = COPY $vgpr0
+ %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %5:sreg_64 = S_MOV_B64 0
+ %7:vreg_64 = COPY %5
+ %9:vgpr_32 = GLOBAL_LOAD_DWORD killed %7, 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ %10:vgpr_32, %11:vgpr_32 = V_PERMLANE32_SWAP_B32_e64 %9:vgpr_32, %9:vgpr_32, 0, 0, implicit $exec
+ %15:vgpr_32(s32) = COPY $vgpr0
+ %16:sreg_32 = S_MOV_B32 1
+ %17:sreg_64 = V_CMP_LT_I32_e64 %15(s32), %16, implicit $exec
+ %18:sreg_64 = COPY %17
+ %19:sreg_64 = SI_IF %18, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ %20:vgpr_32 = V_MAX_I32_e64 %10:vgpr_32, %11:vgpr_32, implicit $exec
+
+ bb.2:
+ %22:vgpr_32 = PHI %3, %bb.0, %20, %bb.1
+ SI_END_CF %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+ bb.3:
+ S_ENDPGM 0, implicit %22
+...
More information about the llvm-commits
mailing list