[llvm] AMDGPU: Perform zero/any extend combine into permute (PR #177370)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Sat Jan 24 04:45:09 PST 2026
================
@@ -3873,3 +3862,404 @@ define hidden void @trunc_vector(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p
store i32 %o0, ptr addrspace(1) %out0, align 4
ret void
}
+
+; Should produce a v_perm instead of shift/and/or
+define amdgpu_kernel void @any_extend_to_perm(i8 %0, <4 x i8> %1) {
+; GFX10-LABEL: any_extend_to_perm:
+; GFX10: ; %bb.0: ; %.lr.ph
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xc0c0006
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, s1, s0, v0
+; GFX10-NEXT: s_and_b32 s0, s1, 0xffff
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT: ds_write_b32 v1, v0
+; GFX10-NEXT: s_endpgm
+;
+; GFX9-LABEL: any_extend_to_perm:
+; GFX9: ; %bb.0: ; %.lr.ph
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0xc0c0006
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_perm_b32 v0, s1, v1, v0
+; GFX9-NEXT: s_and_b32 s2, s1, 0xffff
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_or_b32_e32 v0, s2, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: ds_write_b32 v1, v0
+; GFX9-NEXT: s_endpgm
+.lr.ph:
+ %2 = insertelement <4 x i8> %1, i8 %0, i64 3
+ store <4 x i8> %2, ptr addrspace(3) null, align 4
----------------
arsenm wrote:
Avoid UB in tests. This should probably be a non-kernel function that returns the value instead.
https://github.com/llvm/llvm-project/pull/177370
More information about the llvm-commits mailing list