[llvm] [AMDGPU] auto-generate file check line for amdgcn.bitcast.ll (PR #131955)

via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 18 19:14:30 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: None (Shoreshen)

<details>
<summary>Changes</summary>

Replace check lines by auto-generated

---

Patch is 1.43 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/131955.diff


1 Files Affected:

- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll (+29547-164) 


``````````diff
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
index 5065f57c67dfd..8da2b552dcfa3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
@@ -1,12 +1,49 @@
-; RUN: llc -mtriple=amdgcn -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck -check-prefixes=GFX11 %s
 
 ; This test just checks that the compiler doesn't crash.
 
-; CHECK-LABEL: {{^}}v32i8_to_v8i32:
+
 define amdgpu_ps float @v32i8_to_v8i32(ptr addrspace(4) inreg) #0 {
+; GCN-LABEL: v32i8_to_v8i32:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0x1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_lg_u32 s0, 0
+; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[0:1]
+; GCN-NEXT:    ; return to shader part epilog
+;
+; VI-LABEL: v32i8_to_v8i32:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dword s0, s[0:1], 0x4
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_cmp_lg_u32 s0, 0
+; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[0:1]
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: v32i8_to_v8i32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x4
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[0:1]
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: v32i8_to_v8i32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x4
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s0
+; GFX11-NEXT:    ; return to shader part epilog
 entry:
   %1 = load <32 x i8>, ptr addrspace(4) %0
   %2 = bitcast <32 x i8> %1 to <8 x i32>
@@ -16,9 +53,62 @@ entry:
   ret float %5
 }
 
-; CHECK-LABEL: {{^}}i8ptr_v16i8ptr:
-; CHECK: s_endpgm
 define amdgpu_kernel void @i8ptr_v16i8ptr(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: i8ptr_v16i8ptr:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NEXT:    v_mov_b32_e32 v3, s7
+; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+;
+; VI-LABEL: i8ptr_v16i8ptr:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v2, s6
+; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: i8ptr_v16i8ptr:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: i8ptr_v16i8ptr:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT:    v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
+; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT:    s_endpgm
 entry:
   %0 = load <16 x i8>, ptr addrspace(1) %in
   store <16 x i8> %0, ptr addrspace(1) %out
@@ -26,6 +116,63 @@ entry:
 }
 
 define amdgpu_kernel void @f32_to_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GCN-LABEL: f32_to_v2i16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x0
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_add_f32_e64 v0, s4, 1.0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, 2, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, 0x20000, v0
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+;
+; VI-LABEL: f32_to_v2i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_add_f32_e64 v2, s2, 1.0
+; VI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 2, v2
+; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 0x20000, v2
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: f32_to_v2i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_add_f32_e64 v1, s2, 1.0
+; GFX9-NEXT:    v_pk_add_u16 v1, v1, 2 op_sel_hi:[1,0]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: f32_to_v2i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_add_f32_e64 v0, s2, 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_pk_add_u16 v0, v0, 2 op_sel_hi:[1,0]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    s_endpgm
   %load = load float, ptr addrspace(1) %in, align 4
   %fadd32 = fadd float %load, 1.0
   %bc = bitcast float %fadd32 to <2 x i16>
@@ -35,6 +182,63 @@ define amdgpu_kernel void @f32_to_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
 }
 
 define amdgpu_kernel void @v2i16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GCN-LABEL: v2i16_to_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x0
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s5, s4, 0xffff0000
+; GCN-NEXT:    s_add_i32 s4, s4, 2
+; GCN-NEXT:    s_and_b32 s4, s4, 0xffff
+; GCN-NEXT:    s_or_b32 s4, s5, s4
+; GCN-NEXT:    s_add_i32 s4, s4, 0x20000
+; GCN-NEXT:    v_add_f32_e64 v0, s4, 1.0
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+;
+; VI-LABEL: v2i16_to_f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_and_b32 s0, s2, 0xffff0000
+; VI-NEXT:    s_add_i32 s2, s2, 2
+; VI-NEXT:    s_and_b32 s1, s2, 0xffff
+; VI-NEXT:    s_or_b32 s0, s0, s1
+; VI-NEXT:    s_add_i32 s0, s0, 0x20000
+; VI-NEXT:    v_add_f32_e64 v2, s0, 1.0
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v2i16_to_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_add_u16 v1, s2, 2 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: v2i16_to_f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_pk_add_u16 v0, s2, 2 op_sel_hi:[1,0]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    s_endpgm
   %load = load <2 x i16>, ptr addrspace(1) %in, align 4
   %add.v2i16 = add <2 x i16> %load, <i16 2, i16 2>
   %bc = bitcast <2 x i16> %add.v2i16 to float
@@ -44,6 +248,67 @@ define amdgpu_kernel void @v2i16_to_f32(ptr addrspace(1) %out, ptr addrspace(1)
 }
 
 define amdgpu_kernel void @f32_to_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GCN-LABEL: f32_to_v2f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x0
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_add_f32_e64 v0, s4, 1.0
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_add_f32_e32 v0, 2.0, v0
+; GCN-NEXT:    v_add_f32_e32 v1, 2.0, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+;
+; VI-LABEL: f32_to_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x4000
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_add_f32_e64 v3, s2, 1.0
+; VI-NEXT:    v_add_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_f16_e32 v3, 2.0, v3
+; VI-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: f32_to_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_add_f32_e64 v1, s2, 1.0
+; GFX9-NEXT:    v_pk_add_f16 v1, v1, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: f32_to_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_add_f32_e64 v0, s2, 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    s_endpgm
   %load = load float, ptr addrspace(1) %in, align 4
   %fadd32 = fadd float %load, 1.0
   %bc = bitcast float %fadd32 to <2 x half>
@@ -53,6 +318,68 @@ define amdgpu_kernel void @f32_to_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
 }
 
 define amdgpu_kernel void @v2f16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GCN-LABEL: v2f16_to_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x0
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, s4
+; GCN-NEXT:    s_lshr_b32 s4, s4, 16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, s4
+; GCN-NEXT:    v_add_f32_e32 v0, 2.0, v0
+; GCN-NEXT:    v_add_f32_e32 v1, 2.0, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+;
+; VI-LABEL: v2f16_to_f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x4000
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshr_b32 s0, s2, 16
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    v_add_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_add_f16_e64 v3, s2, 2.0
+; VI-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v2f16_to_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_add_f16 v1, s2, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: v2f16_to_f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_pk_add_f16 v0, s2, 2.0 op_sel_hi:[1,0]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    s_endpgm
   %load = load <2 x half>, ptr addrspace(1) %in, align 4
   %add.v2f16 = fadd <2 x half> %load, <half 2.0, half 2.0>
   %bc = bitcast <2 x half> %add.v2f16 to float
@@ -62,6 +389,50 @@ define amdgpu_kernel void @v2f16_to_f32(ptr addrspace(1) %out, ptr addrspace(1)
 }
 
 define amdgpu_kernel void @v4i8_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GCN-LABEL: v4i8_to_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x0
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+;
+; VI-LABEL: v4i8_to_i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v4i8_to_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: v4i8_to_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_endpgm
   %load = load <4 x i8>, ptr addrspace(1) %in, align 4
   %bc = bitcast <4 x i8> %load to i32
   store i32 %bc, ptr addrspace(1) %out, align 4
@@ -69,15 +440,112 @@ define amdgpu_kernel void @v4i8_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 }
 
 define amdgpu_kernel void @i32_to_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GCN-LABEL: i32_to_v4i8:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x0
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+;
+; VI-LABEL: i32_to_v4i8:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: i32_to_v4i8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: i32_to_v4i8:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_endpgm
   %load = load i32, ptr addrspace(1) %in, align 4
   %bc = bitcast i32 %load to <4 x i8>
   store <4 x i8> %bc, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; CHECK-LABEL: {{^}}bitcast_v2i32_to_f64:
-; CHECK: s_endpgm
+
 define amdgpu_kernel void @bitcast_v2i32_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: bitcast_v2i32_to_f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_add_i32 s5, s5, 9
+; GCN-NEXT:    s_add_i32 s4, s4, 4
+; GCN-NEXT:    v_add_f64 v[0:1], s[4:5], 1.0
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+;
+; VI-LABEL: bitcast_v2i32_to_f64:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/131955


More information about the llvm-commits mailing list