[llvm] stop emitting direct copy from intermediate result to out reg (PR #135326)

Thu Apr 17 09:58:08 PDT 2025

================
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s -check-prefixes=GFX11
+
+define amdgpu_ps float @uniform_fpext(half inreg %x) {
+; GFX11-LABEL: uniform_fpext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s0
+; GFX11-NEXT:    ; return to shader part epilog
+  %f = fpext half %x to float
+  ret float %f
+}
+
+define amdgpu_ps i64 @uniform_vbfi_val_op(i32 inreg %a, i32 inreg %b) {
+; GFX11-LABEL: uniform_vbfi_val_op:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, llvm.amdgcn.bfi.i32@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, llvm.amdgcn.bfi.i32@gotpcrel32@hi+12
+; GFX11-NEXT:    v_dual_mov_b32 v0, -1 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-NEXT:    s_mov_b64 s[8:9], 36
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    ; return to shader part epilog
+  %mask = xor i32 -1, 0
+  %bfi = call i32 @llvm.amdgcn.bfi.i32(i32 %mask, i32 %a, i32 %b)
+  %ext = zext i32 %bfi to i64
+  ret i64 %ext
+}
+
+declare i32 @llvm.amdgcn.bfi.i32(i32, i32, i32)
+
+
+define amdgpu_ps <2 x i32> @s_uniform_val_v2i32(<2 x i32> inreg %x, <2 x i32> inreg %y) {
+; GFX11-LABEL: s_uniform_val_v2i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_getpc_b64 s[4:5]
+; GFX11-NEXT:    s_add_u32 s4, s4, llvm.amdgcn.bfi.v2i32@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s5, s5, llvm.amdgcn.bfi.v2i32@gotpcrel32@hi+12
+; GFX11-NEXT:    v_dual_mov_b32 v0, -1 :: v_dual_mov_b32 v1, -1
+; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; GFX11-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX11-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX11-NEXT:    s_mov_b64 s[8:9], 36
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX11-NEXT:    ; return to shader part epilog
+  %mask = xor <2 x i32> <i32 -1, i32 -1>, zeroinitializer
+  %bfi = call <2 x i32> @llvm.amdgcn.bfi.v2i32(<2 x i32> %mask, <2 x i32> %x, <2 x i32> %y)
+  ret <2 x i32> %bfi
+}
+
+declare <2 x i32> @llvm.amdgcn.bfi.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
----------------
arsenm wrote:

This isn't a legal intrinsic and isn't used.

https://github.com/llvm/llvm-project/pull/135326