[llvm] Mark permlane instructions as convergent (PR #142962)

Thu Jun 5 06:13:16 PDT 2025

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Paul Trojahn (ptrojahn)

<details>
<summary>Changes</summary>

Not sure if this is the right place to add this, but without explicitly setting it as convergent, the permlanex16 instruction in the test sinks into the second block.

---
Full diff: https://github.com/llvm/llvm-project/pull/142962.diff


2 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll (+68) 


``````````diff

diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index a2672d71cb43c..d63a0c102152a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1333,10 +1333,10 @@ let SubtargetPredicate = isGFX10Plus in {
   } // End isCommutable = 1, isReMaterializable = 1
   def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32_e64>;
 
-  let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+  let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in", isConvergent = 1 in {
     defm V_PERMLANE16_B32 : VOP3Inst<"v_permlane16_b32", VOP3_PERMLANE_Profile>;
     defm V_PERMLANEX16_B32 : VOP3Inst<"v_permlanex16_b32", VOP3_PERMLANE_Profile>;
-  } // End $vdst = $vdst_in, DisableEncoding $vdst_in
+  } // End $vdst = $vdst_in, DisableEncoding $vdst_in, isConvergent = 1
 
   foreach vt = Reg32Types.types in {
     def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64, vt>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index 693e0ebd0280c..d05501fb0c8b7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -10410,3 +10410,71 @@ define void @v_permlanex16_v8f64(ptr addrspace(1) %out, <8 x double> %src0, i32
   store <8 x double> %v, ptr addrspace(1) %out
   ret void
 }
+
+define amdgpu_kernel void @v_permlanex16_convergent(ptr addrspace(1) %out, i32 %src0, i32 %pattern_lo, i32 %pattern_hi) {
+; GFX10-LABEL: v_permlanex16_convergent:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; GFX10-NEXT:    s_load_dword s2, s[4:5], 0x34
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-NEXT:    v_permlanex16_b32 v1, v1, s1, s2
+; GFX10-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB142_2
+; GFX10-NEXT:  ; %bb.1: ; %t
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:  .LBB142_2: ; %f
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: v_permlanex16_convergent:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x34
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_permlanex16_b32 v1, v1, s1, s2
+; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT:    s_cbranch_execz .LBB142_2
+; GFX11-NEXT:  ; %bb.1: ; %t
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:  .LBB142_2: ; %f
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: v_permlanex16_convergent:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x2c
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX12-NEXT:    s_mov_b32 s0, exec_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_permlanex16_b32 v1, v1, s1, s2
+; GFX12-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX12-NEXT:    s_cbranch_execz .LBB142_2
+; GFX12-NEXT:  ; %bb.1: ; %t
+; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:  .LBB142_2: ; %f
+; GFX12-NEXT:    s_endpgm
+  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+  %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %pattern_lo, i32 %pattern_hi, i1 false, i1 false)
+  %select = icmp eq i32 %tidx, 0
+  br i1 %select, label %t, label %f
+t:
+  store i32 %v, ptr addrspace(1) %out
+  br label %f
+f:
+  ret void
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/142962