[llvm] Mark permlane instructions as convergent (PR #142962)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 5 06:13:16 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Paul Trojahn (ptrojahn)
<details>
<summary>Changes</summary>
Not sure if this is the right place to add this, but without explicitly marking the instruction as convergent, the permlanex16 in the test gets sunk into the second block (`%t`), which is only executed by the lane whose workitem id is 0.
---
Full diff: https://github.com/llvm/llvm-project/pull/142962.diff
2 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll (+68)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index a2672d71cb43c..d63a0c102152a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1333,10 +1333,10 @@ let SubtargetPredicate = isGFX10Plus in {
} // End isCommutable = 1, isReMaterializable = 1
def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32_e64>;
- let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+ let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in", isConvergent = 1 in {
defm V_PERMLANE16_B32 : VOP3Inst<"v_permlane16_b32", VOP3_PERMLANE_Profile>;
defm V_PERMLANEX16_B32 : VOP3Inst<"v_permlanex16_b32", VOP3_PERMLANE_Profile>;
- } // End $vdst = $vdst_in, DisableEncoding $vdst_in
+ } // End $vdst = $vdst_in, DisableEncoding $vdst_in, isConvergent = 1
foreach vt = Reg32Types.types in {
def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64, vt>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index 693e0ebd0280c..d05501fb0c8b7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -10410,3 +10410,71 @@ define void @v_permlanex16_v8f64(ptr addrspace(1) %out, <8 x double> %src0, i32
store <8 x double> %v, ptr addrspace(1) %out
ret void
}
+
+define amdgpu_kernel void @v_permlanex16_convergent(ptr addrspace(1) %out, i32 %src0, i32 %pattern_lo, i32 %pattern_hi) {
+; GFX10-LABEL: v_permlanex16_convergent:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; GFX10-NEXT: s_load_dword s2, s[4:5], 0x34
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-NEXT: v_permlanex16_b32 v1, v1, s1, s2
+; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX10-NEXT: s_cbranch_execz .LBB142_2
+; GFX10-NEXT: ; %bb.1: ; %t
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: .LBB142_2: ; %f
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_permlanex16_convergent:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_permlanex16_b32 v1, v1, s1, s2
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB142_2
+; GFX11-NEXT: ; %bb.1: ; %t
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: .LBB142_2: ; %f
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_permlanex16_convergent:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX12-NEXT: s_mov_b32 s0, exec_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_permlanex16_b32 v1, v1, s1, s2
+; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX12-NEXT: s_cbranch_execz .LBB142_2
+; GFX12-NEXT: ; %bb.1: ; %t
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: .LBB142_2: ; %f
+; GFX12-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %pattern_lo, i32 %pattern_hi, i1 false, i1 false)
+ %select = icmp eq i32 %tidx, 0
+ br i1 %select, label %t, label %f
+t:
+ store i32 %v, ptr addrspace(1) %out
+ br label %f
+f:
+ ret void
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/142962
More information about the llvm-commits mailing list