[llvm] b907920 - [AMDGPU] auto-generate file check line for amdgcn.bitcast.ll (#131955)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 19 00:41:04 PDT 2025
Author: Shoreshen
Date: 2025-03-19T15:40:58+08:00
New Revision: b907920058001ecb94fdd6939343fab5b786d96b
URL: https://github.com/llvm/llvm-project/commit/b907920058001ecb94fdd6939343fab5b786d96b
DIFF: https://github.com/llvm/llvm-project/commit/b907920058001ecb94fdd6939343fab5b786d96b.diff
LOG: [AMDGPU] auto-generate file check line for amdgcn.bitcast.ll (#131955)
Replace the hand-written check lines with auto-generated ones.
Added:
Modified:
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
index 5065f57c67dfd..8da2b552dcfa3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
@@ -1,12 +1,49 @@
-; RUN: llc -mtriple=amdgcn -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck -check-prefixes=GFX11 %s
; This test just checks that the compiler doesn't crash.
-; CHECK-LABEL: {{^}}v32i8_to_v8i32:
+
define amdgpu_ps float @v32i8_to_v8i32(ptr addrspace(4) inreg) #0 {
+; GCN-LABEL: v32i8_to_v8i32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s0, s[0:1], 0x1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[0:1]
+; GCN-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: v32i8_to_v8i32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s0, s[0:1], 0x4
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[0:1]
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: v32i8_to_v8i32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s0, s[0:1], 0x4
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_lg_u32 s0, 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[0:1]
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: v32i8_to_v8i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x4
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_cselect_b32 s0, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s0
+; GFX11-NEXT: ; return to shader part epilog
entry:
%1 = load <32 x i8>, ptr addrspace(4) %0
%2 = bitcast <32 x i8> %1 to <8 x i32>
@@ -16,9 +53,62 @@ entry:
ret float %5
}
-; CHECK-LABEL: {{^}}i8ptr_v16i8ptr:
-; CHECK: s_endpgm
define amdgpu_kernel void @i8ptr_v16i8ptr(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: i8ptr_v16i8ptr:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NEXT: v_mov_b32_e32 v3, s7
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: i8ptr_v16i8ptr:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: i8ptr_v16i8ptr:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: i8ptr_v16i8ptr:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: s_endpgm
entry:
%0 = load <16 x i8>, ptr addrspace(1) %in
store <16 x i8> %0, ptr addrspace(1) %out
@@ -26,6 +116,63 @@ entry:
}
define amdgpu_kernel void @f32_to_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GCN-LABEL: f32_to_v2i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_add_f32_e64 v0, s4, 1.0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 2, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x20000, v0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: f32_to_v2i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dword s2, s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_add_f32_e64 v2, s2, 1.0
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x20000, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: f32_to_v2i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_add_f32_e64 v1, s2, 1.0
+; GFX9-NEXT: v_pk_add_u16 v1, v1, 2 op_sel_hi:[1,0]
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: f32_to_v2i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_add_f32_e64 v0, s2, 1.0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_add_u16 v0, v0, 2 op_sel_hi:[1,0]
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %in, align 4
%fadd32 = fadd float %load, 1.0
%bc = bitcast float %fadd32 to <2 x i16>
@@ -35,6 +182,63 @@ define amdgpu_kernel void @f32_to_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
}
define amdgpu_kernel void @v2i16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GCN-LABEL: v2i16_to_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s5, s4, 0xffff0000
+; GCN-NEXT: s_add_i32 s4, s4, 2
+; GCN-NEXT: s_and_b32 s4, s4, 0xffff
+; GCN-NEXT: s_or_b32 s4, s5, s4
+; GCN-NEXT: s_add_i32 s4, s4, 0x20000
+; GCN-NEXT: v_add_f32_e64 v0, s4, 1.0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: v2i16_to_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dword s2, s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_and_b32 s0, s2, 0xffff0000
+; VI-NEXT: s_add_i32 s2, s2, 2
+; VI-NEXT: s_and_b32 s1, s2, 0xffff
+; VI-NEXT: s_or_b32 s0, s0, s1
+; VI-NEXT: s_add_i32 s0, s0, 0x20000
+; VI-NEXT: v_add_f32_e64 v2, s0, 1.0
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v2i16_to_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_pk_add_u16 v1, s2, 2 op_sel_hi:[1,0]
+; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v2i16_to_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_pk_add_u16 v0, s2, 2 op_sel_hi:[1,0]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_endpgm
%load = load <2 x i16>, ptr addrspace(1) %in, align 4
%add.v2i16 = add <2 x i16> %load, <i16 2, i16 2>
%bc = bitcast <2 x i16> %add.v2i16 to float
@@ -44,6 +248,67 @@ define amdgpu_kernel void @v2i16_to_f32(ptr addrspace(1) %out, ptr addrspace(1)
}
define amdgpu_kernel void @f32_to_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GCN-LABEL: f32_to_v2f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_add_f32_e64 v0, s4, 1.0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_add_f32_e32 v0, 2.0, v0
+; GCN-NEXT: v_add_f32_e32 v1, 2.0, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: f32_to_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: v_mov_b32_e32 v2, 0x4000
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dword s2, s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_add_f32_e64 v3, s2, 1.0
+; VI-NEXT: v_add_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v3, 2.0, v3
+; VI-NEXT: v_or_b32_e32 v2, v3, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: f32_to_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_add_f32_e64 v1, s2, 1.0
+; GFX9-NEXT: v_pk_add_f16 v1, v1, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: f32_to_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_add_f32_e64 v0, s2, 1.0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %in, align 4
%fadd32 = fadd float %load, 1.0
%bc = bitcast float %fadd32 to <2 x half>
@@ -53,6 +318,68 @@ define amdgpu_kernel void @f32_to_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
}
define amdgpu_kernel void @v2f16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GCN-LABEL: v2f16_to_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, s4
+; GCN-NEXT: s_lshr_b32 s4, s4, 16
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, s4
+; GCN-NEXT: v_add_f32_e32 v0, 2.0, v0
+; GCN-NEXT: v_add_f32_e32 v1, 2.0, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: v2f16_to_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: v_mov_b32_e32 v2, 0x4000
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dword s2, s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshr_b32 s0, s2, 16
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_add_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_f16_e64 v3, s2, 2.0
+; VI-NEXT: v_or_b32_e32 v2, v3, v2
+; VI-NEXT: v_add_f32_e32 v2, 1.0, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v2f16_to_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_pk_add_f16 v1, s2, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v2f16_to_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v0, s2, 2.0 op_sel_hi:[1,0]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_endpgm
%load = load <2 x half>, ptr addrspace(1) %in, align 4
%add.v2f16 = fadd <2 x half> %load, <half 2.0, half 2.0>
%bc = bitcast <2 x half> %add.v2f16 to float
@@ -62,6 +389,50 @@ define amdgpu_kernel void @v2f16_to_f32(ptr addrspace(1) %out, ptr addrspace(1)
}
define amdgpu_kernel void @v4i8_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GCN-LABEL: v4i8_to_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: v4i8_to_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dword s2, s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v4i8_to_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v4i8_to_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
%load = load <4 x i8>, ptr addrspace(1) %in, align 4
%bc = bitcast <4 x i8> %load to i32
store i32 %bc, ptr addrspace(1) %out, align 4
@@ -69,15 +440,112 @@ define amdgpu_kernel void @v4i8_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %
}
define amdgpu_kernel void @i32_to_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GCN-LABEL: i32_to_v4i8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: i32_to_v4i8:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dword s2, s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: i32_to_v4i8:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: i32_to_v4i8:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
%load = load i32, ptr addrspace(1) %in, align 4
%bc = bitcast i32 %load to <4 x i8>
store <4 x i8> %bc, ptr addrspace(1) %out, align 4
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v2i32_to_f64:
-; CHECK: s_endpgm
+
define amdgpu_kernel void @bitcast_v2i32_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: bitcast_v2i32_to_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_add_i32 s5, s5, 9
+; GCN-NEXT: s_add_i32 s4, s4, 4
+; GCN-NEXT: v_add_f64 v[0:1], s[4:5], 1.0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v2i32_to_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s3, s3, 9
+; VI-NEXT: s_add_i32 s2, s2, 4
+; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v2i32_to_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s3, s3, 9
+; GFX9-NEXT: s_add_i32 s2, s2, 4
+; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], 1.0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v2i32_to_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_i32 s3, s3, 9
+; GFX11-NEXT: s_add_i32 s2, s2, 4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_add_f64 v[0:1], s[2:3], 1.0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%val = load <2 x i32>, ptr addrspace(1) %in, align 8
%add = add <2 x i32> %val, <i32 4, i32 9>
%bc = bitcast <2 x i32> %add to double
@@ -86,9 +554,53 @@ define amdgpu_kernel void @bitcast_v2i32_to_f64(ptr addrspace(1) %out, ptr addrs
ret void
}
-; CHECK-LABEL: {{^}}bitcast_f64_to_v2i32:
-; CHECK: s_endpgm
+
define amdgpu_kernel void @bitcast_f64_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: bitcast_f64_to_v2i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_add_f64 v[0:1], s[4:5], 4.0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_f64_to_v2i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_add_f64 v[0:1], s[2:3], 4.0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_f64_to_v2i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], 4.0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_f64_to_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_add_f64 v[0:1], s[2:3], 4.0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%val = load double, ptr addrspace(1) %in, align 8
%add = fadd double %val, 4.0
%bc = bitcast double %add to <2 x i32>
@@ -96,8 +608,114 @@ define amdgpu_kernel void @bitcast_f64_to_v2i32(ptr addrspace(1) %out, ptr addrs
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v2i64_to_v2f64:
+
define amdgpu_kernel void @bitcast_v2i64_to_v2f64(i32 %cond, ptr addrspace(1) %out, <2 x i64> %value) {
+; GCN-LABEL: bitcast_v2i64_to_v2f64:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s9, s[4:5], 0x9
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN-NEXT: s_mov_b32 s8, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s9, 0
+; GCN-NEXT: s_mov_b32 s9, s8
+; GCN-NEXT: s_mov_b32 s10, s8
+; GCN-NEXT: s_mov_b32 s11, s8
+; GCN-NEXT: s_cbranch_scc1 .LBB10_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: s_mov_b32 s4, s2
+; GCN-NEXT: s_mov_b32 s5, s3
+; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
+; GCN-NEXT: .LBB10_2: ; %end
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: v_mov_b32_e32 v2, s10
+; GCN-NEXT: v_mov_b32_e32 v3, s11
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v2i64_to_v2f64:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s11, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT: s_mov_b32 s8, 0
+; VI-NEXT: s_mov_b32 s9, s8
+; VI-NEXT: s_mov_b32 s10, s8
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s11, 0
+; VI-NEXT: s_mov_b32 s11, s8
+; VI-NEXT: s_cbranch_scc1 .LBB10_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: s_mov_b32 s4, s2
+; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b64 s[10:11], s[6:7]
+; VI-NEXT: s_mov_b64 s[8:9], s[4:5]
+; VI-NEXT: .LBB10_2: ; %end
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v2i64_to_v2f64:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s11, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT: s_mov_b32 s8, 0
+; GFX9-NEXT: s_mov_b32 s9, s8
+; GFX9-NEXT: s_mov_b32 s10, s8
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_lg_u32 s11, 0
+; GFX9-NEXT: s_mov_b32 s11, s8
+; GFX9-NEXT: s_cbranch_scc1 .LBB10_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: s_mov_b32 s4, s2
+; GFX9-NEXT: s_mov_b32 s5, s3
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: s_mov_b64 s[8:9], s[4:5]
+; GFX9-NEXT: .LBB10_2: ; %end
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v2, s10
+; GFX9-NEXT: v_mov_b32_e32 v3, s11
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v2i64_to_v2f64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b32 s11, s[4:5], 0x24
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_mov_b32 s8, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s9, s8
+; GFX11-NEXT: s_mov_b32 s10, s8
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_cmp_lg_u32 s11, 0
+; GFX11-NEXT: s_mov_b32 s11, s8
+; GFX11-NEXT: s_cbranch_scc1 .LBB10_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: s_mov_b32 s4, s2
+; GFX11-NEXT: s_mov_b32 s5, s3
+; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX11-NEXT: s_mov_b64 s[8:9], s[4:5]
+; GFX11-NEXT: .LBB10_2: ; %end
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s11
+; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9
+; GFX11-NEXT: v_mov_b32_e32 v2, s10
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -112,8 +730,114 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v2f64_to_v2i64:
+
define amdgpu_kernel void @bitcast_v2f64_to_v2i64(i32 %cond, ptr addrspace(1) %out, <2 x double> %value) {
+; GCN-LABEL: bitcast_v2f64_to_v2i64:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s9, s[4:5], 0x9
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf
+; GCN-NEXT: s_mov_b32 s8, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s9, 0
+; GCN-NEXT: s_mov_b32 s9, s8
+; GCN-NEXT: s_mov_b32 s10, s8
+; GCN-NEXT: s_mov_b32 s11, s8
+; GCN-NEXT: s_cbranch_scc1 .LBB11_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: s_mov_b32 s4, s2
+; GCN-NEXT: s_mov_b32 s5, s3
+; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
+; GCN-NEXT: .LBB11_2: ; %end
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: v_mov_b32_e32 v2, s10
+; GCN-NEXT: v_mov_b32_e32 v3, s11
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v2f64_to_v2i64:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s11, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; VI-NEXT: s_mov_b32 s8, 0
+; VI-NEXT: s_mov_b32 s9, s8
+; VI-NEXT: s_mov_b32 s10, s8
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s11, 0
+; VI-NEXT: s_mov_b32 s11, s8
+; VI-NEXT: s_cbranch_scc1 .LBB11_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: s_mov_b32 s4, s2
+; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b64 s[10:11], s[6:7]
+; VI-NEXT: s_mov_b64 s[8:9], s[4:5]
+; VI-NEXT: .LBB11_2: ; %end
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v2f64_to_v2i64:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s11, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX9-NEXT: s_mov_b32 s8, 0
+; GFX9-NEXT: s_mov_b32 s9, s8
+; GFX9-NEXT: s_mov_b32 s10, s8
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_lg_u32 s11, 0
+; GFX9-NEXT: s_mov_b32 s11, s8
+; GFX9-NEXT: s_cbranch_scc1 .LBB11_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: s_mov_b32 s4, s2
+; GFX9-NEXT: s_mov_b32 s5, s3
+; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT: s_mov_b64 s[8:9], s[4:5]
+; GFX9-NEXT: .LBB11_2: ; %end
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v2, s10
+; GFX9-NEXT: v_mov_b32_e32 v3, s11
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v2f64_to_v2i64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b32 s11, s[4:5], 0x24
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
+; GFX11-NEXT: s_mov_b32 s8, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s9, s8
+; GFX11-NEXT: s_mov_b32 s10, s8
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_cmp_lg_u32 s11, 0
+; GFX11-NEXT: s_mov_b32 s11, s8
+; GFX11-NEXT: s_cbranch_scc1 .LBB11_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: s_mov_b32 s4, s2
+; GFX11-NEXT: s_mov_b32 s5, s3
+; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX11-NEXT: s_mov_b64 s[8:9], s[4:5]
+; GFX11-NEXT: .LBB11_2: ; %end
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s11
+; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9
+; GFX11-NEXT: v_mov_b32_e32 v2, s10
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -128,8 +852,78 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v4i16_to_f64:
+
define amdgpu_kernel void @v4i16_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GCN-LABEL: v4i16_to_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s6, s5, 0xffff0000
+; GCN-NEXT: s_add_i32 s5, s5, 4
+; GCN-NEXT: s_and_b32 s7, s4, 0xffff0000
+; GCN-NEXT: s_add_i32 s4, s4, 4
+; GCN-NEXT: s_and_b32 s5, s5, 0xffff
+; GCN-NEXT: s_and_b32 s4, s4, 0xffff
+; GCN-NEXT: s_or_b32 s5, s6, s5
+; GCN-NEXT: s_or_b32 s4, s7, s4
+; GCN-NEXT: s_add_i32 s5, s5, 0x40000
+; GCN-NEXT: s_add_i32 s4, s4, 0x40000
+; GCN-NEXT: v_add_f64 v[0:1], s[4:5], 1.0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: v4i16_to_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_and_b32 s4, s2, 0xffff0000
+; VI-NEXT: s_add_i32 s2, s2, 4
+; VI-NEXT: s_and_b32 s5, s3, 0xffff0000
+; VI-NEXT: s_add_i32 s3, s3, 4
+; VI-NEXT: s_and_b32 s3, s3, 0xffff
+; VI-NEXT: s_and_b32 s2, s2, 0xffff
+; VI-NEXT: s_or_b32 s3, s5, s3
+; VI-NEXT: s_or_b32 s2, s4, s2
+; VI-NEXT: s_add_i32 s3, s3, 0x40000
+; VI-NEXT: s_add_i32 s2, s2, 0x40000
+; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v4i16_to_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_pk_add_u16 v1, s5, 4 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v0, s4, 4 op_sel_hi:[1,0]
+; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v4i16_to_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_pk_add_u16 v1, s3, 4 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v0, s2, 4 op_sel_hi:[1,0]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%load = load <4 x i16>, ptr addrspace(1) %in, align 4
%add.v4i16 = add <4 x i16> %load, <i16 4, i16 4, i16 4, i16 4>
%bc = bitcast <4 x i16> %add.v4i16 to double
@@ -138,8 +932,87 @@ define amdgpu_kernel void @v4i16_to_f64(ptr addrspace(1) %out, ptr addrspace(1)
ret void
}
-; CHECK-LABEL: {{^}}v4f16_to_f64:
+
define amdgpu_kernel void @v4f16_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GCN-LABEL: v4f16_to_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, s4
+; GCN-NEXT: s_lshr_b32 s4, s4, 16
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, s5
+; GCN-NEXT: s_lshr_b32 s5, s5, 16
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, s4
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, s5
+; GCN-NEXT: v_add_f32_e32 v1, 4.0, v1
+; GCN-NEXT: v_add_f32_e32 v0, 4.0, v0
+; GCN-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_or_b32_e32 v1, v1, v3
+; GCN-NEXT: v_or_b32_e32 v0, v0, v2
+; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: v4f16_to_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: v_mov_b32_e32 v0, 0x4400
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshr_b32 s4, s3, 16
+; VI-NEXT: v_add_f16_e64 v1, s3, 4.0
+; VI-NEXT: s_lshr_b32 s3, s2, 16
+; VI-NEXT: v_mov_b32_e32 v3, s4
+; VI-NEXT: v_mov_b32_e32 v4, s3
+; VI-NEXT: v_add_f16_e64 v2, s2, 4.0
+; VI-NEXT: v_add_f16_sdwa v3, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v1, v1, v3
+; VI-NEXT: v_or_b32_e32 v0, v2, v0
+; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v4f16_to_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_pk_add_f16 v1, s5, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v0, s4, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v4f16_to_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v1, s3, 4.0 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_f16 v0, s2, 4.0 op_sel_hi:[1,0]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%load = load <4 x half>, ptr addrspace(1) %in, align 4
%add.v4half = fadd <4 x half> %load, <half 4.0, half 4.0, half 4.0, half 4.0>
%bc = bitcast <4 x half> %add.v4half to double
@@ -148,8 +1021,83 @@ define amdgpu_kernel void @v4f16_to_f64(ptr addrspace(1) %out, ptr addrspace(1)
ret void
}
-; CHECK-LABEL: {{^}}f64_to_v4f16:
+
define amdgpu_kernel void @f64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GCN-LABEL: f64_to_v4f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_add_f64 v[0:1], s[4:5], 1.0
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_add_f32_e32 v1, 2.0, v1
+; GCN-NEXT: v_add_f32_e32 v0, 2.0, v0
+; GCN-NEXT: v_add_f32_e32 v2, 2.0, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_add_f32_e32 v3, 2.0, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v1, v1, v2
+; GCN-NEXT: v_or_b32_e32 v0, v0, v3
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: f64_to_v4f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: v_mov_b32_e32 v4, 0x4000
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0
+; VI-NEXT: v_add_f16_sdwa v5, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v1, 2.0, v1
+; VI-NEXT: v_add_f16_sdwa v4, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v0, 2.0, v0
+; VI-NEXT: v_or_b32_e32 v1, v1, v5
+; VI-NEXT: v_or_b32_e32 v0, v0, v4
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: f64_to_v4f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_add_f64 v[0:1], s[4:5], 1.0
+; GFX9-NEXT: v_pk_add_f16 v1, v1, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: f64_to_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_add_f64 v[0:1], s[2:3], 1.0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_pk_add_f16 v1, v1, 2.0 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%load = load double, ptr addrspace(1) %in, align 4
%fadd32 = fadd double %load, 1.0
%bc = bitcast double %fadd32 to <4 x half>
@@ -158,8 +1106,76 @@ define amdgpu_kernel void @f64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1)
ret void
}
-; CHECK-LABEL: {{^}}f64_to_v4i16:
+
define amdgpu_kernel void @f64_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GCN-LABEL: f64_to_v4i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_add_f64 v[0:1], s[4:5], 1.0
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 2, v1
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 2, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_or_b32_e32 v1, v2, v1
+; GCN-NEXT: v_or_b32_e32 v0, v3, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x20000, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x20000, v0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: f64_to_v4i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
+; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; VI-NEXT: v_add_u32_e32 v1, vcc, 2, v1
+; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x20000, v1
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x20000, v0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: f64_to_v4i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_add_f64 v[0:1], s[4:5], 1.0
+; GFX9-NEXT: v_pk_add_u16 v1, v1, 2 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v0, v0, 2 op_sel_hi:[1,0]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: f64_to_v4i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_add_f64 v[0:1], s[2:3], 1.0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_pk_add_u16 v1, v1, 2 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v0, v0, 2 op_sel_hi:[1,0]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%load = load double, ptr addrspace(1) %in, align 4
%fadd32 = fadd double %load, 1.0
%bc = bitcast double %fadd32 to <4 x i16>
@@ -168,8 +1184,86 @@ define amdgpu_kernel void @f64_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1)
ret void
}
-; CHECK-LABEL: {{^}}v4i16_to_i64:
+
define amdgpu_kernel void @v4i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GCN-LABEL: v4i16_to_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s2, s5, 0xffff0000
+; GCN-NEXT: s_add_i32 s5, s5, 4
+; GCN-NEXT: s_and_b32 s6, s4, 0xffff0000
+; GCN-NEXT: s_add_i32 s4, s4, 4
+; GCN-NEXT: s_and_b32 s5, s5, 0xffff
+; GCN-NEXT: s_and_b32 s4, s4, 0xffff
+; GCN-NEXT: s_or_b32 s2, s2, s5
+; GCN-NEXT: s_or_b32 s4, s6, s4
+; GCN-NEXT: s_add_i32 s2, s2, 0x40000
+; GCN-NEXT: s_add_i32 s4, s4, 0x40000
+; GCN-NEXT: s_add_u32 s4, s4, 1
+; GCN-NEXT: s_addc_u32 s5, s2, 0
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: v4i16_to_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_and_b32 s0, s2, 0xffff0000
+; VI-NEXT: s_add_i32 s1, s2, 4
+; VI-NEXT: s_and_b32 s2, s3, 0xffff0000
+; VI-NEXT: s_add_i32 s3, s3, 4
+; VI-NEXT: s_and_b32 s3, s3, 0xffff
+; VI-NEXT: s_and_b32 s1, s1, 0xffff
+; VI-NEXT: s_or_b32 s2, s2, s3
+; VI-NEXT: s_or_b32 s0, s0, s1
+; VI-NEXT: s_add_i32 s2, s2, 0x40000
+; VI-NEXT: s_add_i32 s0, s0, 0x40000
+; VI-NEXT: s_add_u32 s0, s0, 1
+; VI-NEXT: s_addc_u32 s1, s2, 0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v4i16_to_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_pk_add_u16 v0, s4, 4 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v1, s5, 4 op_sel_hi:[1,0]
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v4i16_to_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_pk_add_u16 v0, s2, 4 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v1, s3, 4 op_sel_hi:[1,0]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%load = load <4 x i16>, ptr addrspace(1) %in, align 4
%add.v4i16 = add <4 x i16> %load, <i16 4, i16 4, i16 4, i16 4>
%bc = bitcast <4 x i16> %add.v4i16 to i64
@@ -178,8 +1272,91 @@ define amdgpu_kernel void @v4i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1)
ret void
}
-; CHECK-LABEL: {{^}}v4f16_to_i64:
+
define amdgpu_kernel void @v4f16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GCN-LABEL: v4f16_to_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, s4
+; GCN-NEXT: s_lshr_b32 s4, s4, 16
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, s5
+; GCN-NEXT: s_lshr_b32 s5, s5, 16
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, s4
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, s5
+; GCN-NEXT: v_add_f32_e32 v1, 4.0, v1
+; GCN-NEXT: v_add_f32_e32 v0, 4.0, v0
+; GCN-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_or_b32_e32 v1, v1, v3
+; GCN-NEXT: v_or_b32_e32 v0, v0, v2
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v0
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: v4f16_to_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: v_mov_b32_e32 v2, 0x4400
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshr_b32 s0, s3, 16
+; VI-NEXT: s_lshr_b32 s1, s2, 16
+; VI-NEXT: v_mov_b32_e32 v5, s0
+; VI-NEXT: v_mov_b32_e32 v6, s1
+; VI-NEXT: v_add_f16_e64 v4, s2, 4.0
+; VI-NEXT: v_add_f16_sdwa v5, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_f16_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_f16_e64 v3, s3, 4.0
+; VI-NEXT: v_or_b32_e32 v2, v4, v2
+; VI-NEXT: v_or_b32_e32 v3, v3, v5
+; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v4f16_to_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_pk_add_f16 v0, s4, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v1, s5, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v4f16_to_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v0, s2, 4.0 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_f16 v1, s3, 4.0 op_sel_hi:[1,0]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%load = load <4 x half>, ptr addrspace(1) %in, align 4
%add.v4half = fadd <4 x half> %load, <half 4.0, half 4.0, half 4.0, half 4.0>
%bc = bitcast <4 x half> %add.v4half to i64
@@ -188,8 +1365,86 @@ define amdgpu_kernel void @v4f16_to_i64(ptr addrspace(1) %out, ptr addrspace(1)
ret void
}
-; CHECK-LABEL: {{^}}bitcast_i64_to_v4i16:
+
define amdgpu_kernel void @bitcast_i64_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: bitcast_i64_to_v4i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_add_u32 s2, s4, 4
+; GCN-NEXT: s_addc_u32 s4, s5, 0
+; GCN-NEXT: s_and_b32 s5, s2, 0xffff0000
+; GCN-NEXT: s_add_i32 s2, s2, 1
+; GCN-NEXT: s_and_b32 s6, s4, 0xffff0000
+; GCN-NEXT: s_add_i32 s4, s4, 3
+; GCN-NEXT: s_and_b32 s2, s2, 0xffff
+; GCN-NEXT: s_and_b32 s4, s4, 0xffff
+; GCN-NEXT: s_or_b32 s2, s5, s2
+; GCN-NEXT: s_or_b32 s4, s6, s4
+; GCN-NEXT: s_add_i32 s5, s2, 0x20000
+; GCN-NEXT: s_add_i32 s4, s4, 0x40000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_i64_to_v4i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_u32 s0, s2, 4
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: s_and_b32 s2, s0, 0xffff0000
+; VI-NEXT: s_add_i32 s0, s0, 1
+; VI-NEXT: s_and_b32 s3, s1, 0xffff0000
+; VI-NEXT: s_add_i32 s1, s1, 3
+; VI-NEXT: s_and_b32 s0, s0, 0xffff
+; VI-NEXT: s_and_b32 s1, s1, 0xffff
+; VI-NEXT: s_or_b32 s0, s2, s0
+; VI-NEXT: s_or_b32 s1, s3, s1
+; VI-NEXT: s_add_i32 s0, s0, 0x20000
+; VI-NEXT: s_add_i32 s1, s1, 0x40000
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_i64_to_v4i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_u32 s2, s2, 4
+; GFX9-NEXT: s_addc_u32 s3, s3, 0
+; GFX9-NEXT: v_pk_add_u16 v1, s3, v0
+; GFX9-NEXT: v_pk_sub_u16 v0, s2, -2 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_i64_to_v4i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_u32 s2, s2, 4
+; GFX11-NEXT: s_addc_u32 s3, s3, 0
+; GFX11-NEXT: v_pk_sub_u16 v0, s2, -2 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v1, 0x40003, s3
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in, align 8
%add = add i64 %val, 4
%bc = bitcast i64 %add to <4 x i16>
@@ -198,8 +1453,93 @@ define amdgpu_kernel void @bitcast_i64_to_v4i16(ptr addrspace(1) %out, ptr addrs
ret void
}
-; CHECK-LABEL: {{^}}bitcast_i64_to_v4f16:
+
define amdgpu_kernel void @bitcast_i64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: bitcast_i64_to_v4f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_add_u32 s4, s4, 4
+; GCN-NEXT: s_addc_u32 s5, s5, 0
+; GCN-NEXT: s_lshr_b32 s6, s4, 16
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, s4
+; GCN-NEXT: s_lshr_b32 s4, s5, 16
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, s6
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, s5
+; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, s4
+; GCN-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GCN-NEXT: v_add_f32_e32 v1, 2.0, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_add_f32_e32 v3, 0x41000000, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GCN-NEXT: v_or_b32_e32 v1, v2, v1
+; GCN-NEXT: v_or_b32_e32 v0, v0, v4
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_i64_to_v4f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: v_mov_b32_e32 v2, 0x4800
+; VI-NEXT: v_mov_b32_e32 v3, 0x4000
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_u32 s0, s2, 4
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: s_lshr_b32 s3, s1, 16
+; VI-NEXT: s_lshr_b32 s2, s0, 16
+; VI-NEXT: v_mov_b32_e32 v6, s3
+; VI-NEXT: v_add_f16_e64 v4, s1, 4.0
+; VI-NEXT: v_mov_b32_e32 v5, s2
+; VI-NEXT: v_add_f16_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_f16_sdwa v5, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v3, v4, v2
+; VI-NEXT: v_add_f16_e64 v2, s0, 1.0
+; VI-NEXT: v_or_b32_e32 v2, v2, v5
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_i64_to_v4f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x48004400
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x40003c00
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_u32 s2, s2, 4
+; GFX9-NEXT: s_addc_u32 s3, s3, 0
+; GFX9-NEXT: v_pk_add_f16 v1, s3, v0
+; GFX9-NEXT: v_pk_add_f16 v0, s2, v3
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_i64_to_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_u32 s2, s2, 4
+; GFX11-NEXT: s_addc_u32 s3, s3, 0
+; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, s2
+; GFX11-NEXT: v_pk_add_f16 v1, 0x48004400, s3
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in, align 8
%add = add i64 %val, 4
%bc = bitcast i64 %add to <4 x half>
@@ -208,8 +1548,81 @@ define amdgpu_kernel void @bitcast_i64_to_v4f16(ptr addrspace(1) %out, ptr addrs
ret void
}
-; CHECK-LABEL: {{^}}v4i16_to_v2f32:
+
define amdgpu_kernel void @v4i16_to_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GCN-LABEL: v4i16_to_v2f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s6, s4, 0xffff0000
+; GCN-NEXT: s_add_i32 s4, s4, 4
+; GCN-NEXT: s_and_b32 s7, s5, 0xffff0000
+; GCN-NEXT: s_add_i32 s5, s5, 4
+; GCN-NEXT: s_and_b32 s4, s4, 0xffff
+; GCN-NEXT: s_and_b32 s5, s5, 0xffff
+; GCN-NEXT: s_or_b32 s4, s6, s4
+; GCN-NEXT: s_or_b32 s5, s7, s5
+; GCN-NEXT: s_add_i32 s4, s4, 0x40000
+; GCN-NEXT: s_add_i32 s5, s5, 0x40000
+; GCN-NEXT: v_add_f32_e64 v1, s5, 1.0
+; GCN-NEXT: v_add_f32_e64 v0, s4, 1.0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: v4i16_to_v2f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_and_b32 s0, s3, 0xffff0000
+; VI-NEXT: s_add_i32 s1, s3, 4
+; VI-NEXT: s_and_b32 s3, s2, 0xffff0000
+; VI-NEXT: s_add_i32 s2, s2, 4
+; VI-NEXT: s_and_b32 s2, s2, 0xffff
+; VI-NEXT: s_and_b32 s1, s1, 0xffff
+; VI-NEXT: s_or_b32 s2, s3, s2
+; VI-NEXT: s_or_b32 s0, s0, s1
+; VI-NEXT: s_add_i32 s2, s2, 0x40000
+; VI-NEXT: s_add_i32 s0, s0, 0x40000
+; VI-NEXT: v_add_f32_e64 v3, s0, 1.0
+; VI-NEXT: v_add_f32_e64 v2, s2, 1.0
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v4i16_to_v2f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_pk_add_u16 v0, s4, 4 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v1, s5, 4 op_sel_hi:[1,0]
+; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v4i16_to_v2f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_pk_add_u16 v0, s3, 4 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v2, s2, 4 op_sel_hi:[1,0]
+; GFX11-NEXT: v_mov_b32_e32 v3, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v0 :: v_dual_add_f32 v0, 1.0, v2
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%load = load <4 x i16>, ptr addrspace(1) %in, align 4
%add.v4i16 = add <4 x i16> %load, <i16 4, i16 4, i16 4, i16 4>
%bc = bitcast <4 x i16> %add.v4i16 to <2 x float>
@@ -218,8 +1631,90 @@ define amdgpu_kernel void @v4i16_to_v2f32(ptr addrspace(1) %out, ptr addrspace(1
ret void
}
-; CHECK-LABEL: {{^}}v4f16_to_v2f32:
+
define amdgpu_kernel void @v4f16_to_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GCN-LABEL: v4f16_to_v2f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, s5
+; GCN-NEXT: s_lshr_b32 s5, s5, 16
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, s4
+; GCN-NEXT: s_lshr_b32 s4, s4, 16
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, s5
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, s4
+; GCN-NEXT: v_add_f32_e32 v1, 4.0, v1
+; GCN-NEXT: v_add_f32_e32 v0, 4.0, v0
+; GCN-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_or_b32_e32 v3, v1, v3
+; GCN-NEXT: v_or_b32_e32 v0, v0, v2
+; GCN-NEXT: v_add_f32_e32 v1, 1.0, v0
+; GCN-NEXT: v_add_f32_e32 v0, 1.0, v3
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: v4f16_to_v2f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: v_mov_b32_e32 v2, 0x4400
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshr_b32 s0, s2, 16
+; VI-NEXT: s_lshr_b32 s1, s3, 16
+; VI-NEXT: v_mov_b32_e32 v5, s0
+; VI-NEXT: v_mov_b32_e32 v6, s1
+; VI-NEXT: v_add_f16_e64 v3, s2, 4.0
+; VI-NEXT: v_add_f16_e64 v4, s3, 4.0
+; VI-NEXT: v_add_f16_sdwa v5, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_f16_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v5, v3, v5
+; VI-NEXT: v_or_b32_e32 v2, v4, v2
+; VI-NEXT: v_add_f32_e32 v3, 1.0, v2
+; VI-NEXT: v_add_f32_e32 v2, 1.0, v5
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v4f16_to_v2f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_pk_add_f16 v0, s4, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v1, s5, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v4f16_to_v2f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v0, s3, 4.0 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_f16 v2, s2, 4.0 op_sel_hi:[1,0]
+; GFX11-NEXT: v_mov_b32_e32 v3, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v0 :: v_dual_add_f32 v0, 1.0, v2
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%load = load <4 x half>, ptr addrspace(1) %in, align 4
%add.v4half = fadd <4 x half> %load, <half 4.0, half 4.0, half 4.0, half 4.0>
%bc = bitcast <4 x half> %add.v4half to <2 x float>
@@ -228,8 +1723,81 @@ define amdgpu_kernel void @v4f16_to_v2f32(ptr addrspace(1) %out, ptr addrspace(1
ret void
}
-; CHECK-LABEL: {{^}}v2f32_to_v4i16:
+
define amdgpu_kernel void @v2f32_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GCN-LABEL: v2f32_to_v4i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_add_f32_e64 v0, s4, 2.0
+; GCN-NEXT: v_add_f32_e64 v1, s5, 4.0
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_or_b32_e32 v1, v2, v1
+; GCN-NEXT: v_or_b32_e32 v0, v3, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x40000, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x20000, v0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: v2f32_to_v4i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_add_f32_e64 v2, s3, 4.0
+; VI-NEXT: v_add_f32_e64 v3, s2, 2.0
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; VI-NEXT: v_add_u32_e32 v5, vcc, 1, v3
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x40000, v2
+; VI-NEXT: v_or_b32_sdwa v2, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x20000, v2
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v2f32_to_v4i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: s_mov_b32 s2, 0x40003
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_add_f32_e64 v0, s4, 2.0
+; GFX9-NEXT: v_add_f32_e64 v1, s5, 4.0
+; GFX9-NEXT: v_pk_add_u16 v1, v1, s2
+; GFX9-NEXT: v_pk_sub_u16 v0, v0, -2 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v2f32_to_v4i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v3, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_add_f32_e64 v0, s3, 4.0
+; GFX11-NEXT: v_add_f32_e64 v2, s2, 2.0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_pk_add_u16 v1, 0x40003, v0
+; GFX11-NEXT: v_pk_sub_u16 v0, v2, -2 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%load = load <2 x float>, ptr addrspace(1) %in, align 4
%add.v2f32 = fadd <2 x float> %load, <float 2.0, float 4.0>
%bc = bitcast <2 x float> %add.v2f32 to <4 x i16>
@@ -238,8 +1806,90 @@ define amdgpu_kernel void @v2f32_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1
ret void
}
-; CHECK-LABEL: {{^}}v2f32_to_v4f16:
+
define amdgpu_kernel void @v2f32_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GCN-LABEL: v2f32_to_v4f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_add_f32_e64 v0, s5, 4.0
+; GCN-NEXT: v_add_f32_e64 v1, s4, 2.0
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_add_f32_e32 v0, 4.0, v0
+; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_add_f32_e32 v2, 0x41000000, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_add_f32_e32 v3, 2.0, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_or_b32_e32 v1, v0, v1
+; GCN-NEXT: v_or_b32_e32 v0, v4, v2
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: v2f32_to_v4f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: v_mov_b32_e32 v2, 0x4800
+; VI-NEXT: v_mov_b32_e32 v3, 0x4000
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_add_f32_e64 v4, s2, 2.0
+; VI-NEXT: v_add_f32_e64 v5, s3, 4.0
+; VI-NEXT: v_add_f16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v5, 4.0, v5
+; VI-NEXT: v_add_f16_sdwa v6, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v4, 1.0, v4
+; VI-NEXT: v_or_b32_e32 v3, v5, v2
+; VI-NEXT: v_or_b32_e32 v2, v4, v6
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v2f32_to_v4f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: s_mov_b32 s2, 0x48004400
+; GFX9-NEXT: s_mov_b32 s3, 0x40003c00
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_add_f32_e64 v0, s4, 2.0
+; GFX9-NEXT: v_add_f32_e64 v1, s5, 4.0
+; GFX9-NEXT: v_pk_add_f16 v1, v1, s2
+; GFX9-NEXT: v_pk_add_f16 v0, v0, s3
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v2f32_to_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v3, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_add_f32_e64 v0, s3, 4.0
+; GFX11-NEXT: v_add_f32_e64 v2, s2, 2.0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_pk_add_f16 v1, 0x48004400, v0
+; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, v2
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%load = load <2 x float>, ptr addrspace(1) %in, align 4
%add.v2f32 = fadd <2 x float> %load, <float 2.0, float 4.0>
%bc = bitcast <2 x float> %add.v2f32 to <4 x half>
@@ -248,8 +1898,82 @@ define amdgpu_kernel void @v2f32_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1
ret void
}
-; CHECK-LABEL: {{^}}v4i16_to_v2i32:
+
define amdgpu_kernel void @v4i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GCN-LABEL: v4i16_to_v2i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s2, s4, 0xffff0000
+; GCN-NEXT: s_add_i32 s4, s4, 4
+; GCN-NEXT: s_and_b32 s6, s5, 0xffff0000
+; GCN-NEXT: s_add_i32 s5, s5, 4
+; GCN-NEXT: s_and_b32 s4, s4, 0xffff
+; GCN-NEXT: s_and_b32 s5, s5, 0xffff
+; GCN-NEXT: s_or_b32 s2, s2, s4
+; GCN-NEXT: s_or_b32 s4, s6, s5
+; GCN-NEXT: s_add_i32 s4, s4, 0x40001
+; GCN-NEXT: s_add_i32 s5, s2, 0x40001
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: v4i16_to_v2i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_and_b32 s0, s3, 0xffff0000
+; VI-NEXT: s_add_i32 s1, s3, 4
+; VI-NEXT: s_and_b32 s3, s2, 0xffff0000
+; VI-NEXT: s_add_i32 s2, s2, 4
+; VI-NEXT: s_and_b32 s2, s2, 0xffff
+; VI-NEXT: s_and_b32 s1, s1, 0xffff
+; VI-NEXT: s_or_b32 s2, s3, s2
+; VI-NEXT: s_or_b32 s0, s0, s1
+; VI-NEXT: s_add_i32 s0, s0, 0x40001
+; VI-NEXT: s_add_i32 s2, s2, 0x40001
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v4i16_to_v2i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_pk_add_u16 v0, s4, 4 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v1, s5, 4 op_sel_hi:[1,0]
+; GFX9-NEXT: v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT: v_add_u32_e32 v0, 1, v0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v4i16_to_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_pk_add_u16 v0, s3, 4 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v2, s2, 4 op_sel_hi:[1,0]
+; GFX11-NEXT: v_mov_b32_e32 v3, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 1, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v2
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%load = load <4 x i16>, ptr addrspace(1) %in, align 4
%add.v4i16 = add <4 x i16> %load, <i16 4, i16 4, i16 4, i16 4>
%bc = bitcast <4 x i16> %add.v4i16 to <2 x i32>
@@ -258,8 +1982,91 @@ define amdgpu_kernel void @v4i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1
ret void
}
-; CHECK-LABEL: {{^}}v4f16_to_v2i32:
+
define amdgpu_kernel void @v4f16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GCN-LABEL: v4f16_to_v2i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, s5
+; GCN-NEXT: s_lshr_b32 s5, s5, 16
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, s4
+; GCN-NEXT: s_lshr_b32 s4, s4, 16
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, s5
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, s4
+; GCN-NEXT: v_add_f32_e32 v1, 4.0, v1
+; GCN-NEXT: v_add_f32_e32 v0, 4.0, v0
+; GCN-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_or_b32_e32 v3, v1, v3
+; GCN-NEXT: v_or_b32_e32 v0, v0, v2
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v0
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v3
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: v4f16_to_v2i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: v_mov_b32_e32 v2, 0x4400
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshr_b32 s0, s2, 16
+; VI-NEXT: s_lshr_b32 s1, s3, 16
+; VI-NEXT: v_mov_b32_e32 v5, s0
+; VI-NEXT: v_mov_b32_e32 v6, s1
+; VI-NEXT: v_add_f16_e64 v3, s2, 4.0
+; VI-NEXT: v_add_f16_e64 v4, s3, 4.0
+; VI-NEXT: v_add_f16_sdwa v5, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_f16_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v5, v3, v5
+; VI-NEXT: v_or_b32_e32 v2, v4, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v2
+; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v5
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v4f16_to_v2i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_pk_add_f16 v0, s4, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v1, s5, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT: v_add_u32_e32 v0, 1, v0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v4f16_to_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v0, s3, 4.0 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_f16 v2, s2, 4.0 op_sel_hi:[1,0]
+; GFX11-NEXT: v_mov_b32_e32 v3, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 1, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v2
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%load = load <4 x half>, ptr addrspace(1) %in, align 4
%add.v4half = fadd <4 x half> %load, <half 4.0, half 4.0, half 4.0, half 4.0>
%bc = bitcast <4 x half> %add.v4half to <2 x i32>
@@ -268,8 +2075,86 @@ define amdgpu_kernel void @v4f16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1
ret void
}
-; CHECK-LABEL: {{^}}v2i32_to_v4i16:
+
define amdgpu_kernel void @v2i32_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GCN-LABEL: v2i32_to_v4i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_add_i32 s2, s4, 2
+; GCN-NEXT: s_add_i32 s6, s5, 4
+; GCN-NEXT: s_add_i32 s5, s5, 7
+; GCN-NEXT: s_add_i32 s4, s4, 3
+; GCN-NEXT: s_and_b32 s5, s5, 0xffff
+; GCN-NEXT: s_and_b32 s6, s6, 0xffff0000
+; GCN-NEXT: s_and_b32 s4, s4, 0xffff
+; GCN-NEXT: s_and_b32 s2, s2, 0xffff0000
+; GCN-NEXT: s_or_b32 s5, s6, s5
+; GCN-NEXT: s_or_b32 s2, s2, s4
+; GCN-NEXT: s_add_i32 s5, s5, 0x40000
+; GCN-NEXT: s_add_i32 s4, s2, 0x20000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: v2i32_to_v4i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s0, s3, 4
+; VI-NEXT: s_add_i32 s1, s2, 2
+; VI-NEXT: s_add_i32 s2, s2, 3
+; VI-NEXT: s_add_i32 s3, s3, 7
+; VI-NEXT: s_and_b32 s1, s1, 0xffff0000
+; VI-NEXT: s_and_b32 s0, s0, 0xffff0000
+; VI-NEXT: s_and_b32 s3, s3, 0xffff
+; VI-NEXT: s_and_b32 s2, s2, 0xffff
+; VI-NEXT: s_or_b32 s0, s0, s3
+; VI-NEXT: s_or_b32 s1, s1, s2
+; VI-NEXT: s_add_i32 s0, s0, 0x40000
+; VI-NEXT: s_add_i32 s1, s1, 0x20000
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v2i32_to_v4i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s2, s4, 2
+; GFX9-NEXT: s_add_i32 s3, s5, 4
+; GFX9-NEXT: v_pk_add_u16 v1, s3, v0
+; GFX9-NEXT: v_pk_sub_u16 v0, s2, -2 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v2i32_to_v4i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_i32 s3, s3, 4
+; GFX11-NEXT: s_add_i32 s2, s2, 2
+; GFX11-NEXT: v_pk_add_u16 v1, 0x40003, s3
+; GFX11-NEXT: v_pk_sub_u16 v0, s2, -2 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%load = load <2 x i32>, ptr addrspace(1) %in, align 4
%add.v2i32 = add <2 x i32> %load, <i32 2, i32 4>
%bc = bitcast <2 x i32> %add.v2i32 to <4 x i16>
@@ -278,8 +2163,93 @@ define amdgpu_kernel void @v2i32_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1
ret void
}
-; CHECK-LABEL: {{^}}v2i32_to_v4f16:
+
define amdgpu_kernel void @v2i32_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GCN-LABEL: v2i32_to_v4f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_add_i32 s5, s5, 4
+; GCN-NEXT: s_add_i32 s4, s4, 2
+; GCN-NEXT: s_lshr_b32 s6, s5, 16
+; GCN-NEXT: s_lshr_b32 s7, s4, 16
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, s4
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, s5
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, s7
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, s6
+; GCN-NEXT: v_add_f32_e32 v1, 4.0, v1
+; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_add_f32_e32 v3, 0x41000000, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_add_f32_e32 v2, 2.0, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_or_b32_e32 v1, v1, v3
+; GCN-NEXT: v_or_b32_e32 v0, v0, v2
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: v2i32_to_v4f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: v_mov_b32_e32 v2, 0x4800
+; VI-NEXT: v_mov_b32_e32 v4, 0x4000
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s1, s3, 4
+; VI-NEXT: s_add_i32 s0, s2, 2
+; VI-NEXT: s_lshr_b32 s2, s1, 16
+; VI-NEXT: v_add_f16_e64 v3, s1, 4.0
+; VI-NEXT: s_lshr_b32 s1, s0, 16
+; VI-NEXT: v_mov_b32_e32 v5, s2
+; VI-NEXT: v_mov_b32_e32 v6, s1
+; VI-NEXT: v_add_f16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v3, v3, v2
+; VI-NEXT: v_add_f16_sdwa v2, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_f16_e64 v4, s0, 1.0
+; VI-NEXT: v_or_b32_e32 v2, v4, v2
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v2i32_to_v4f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x48004400
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x40003c00
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s2, s4, 2
+; GFX9-NEXT: s_add_i32 s3, s5, 4
+; GFX9-NEXT: v_pk_add_f16 v1, s3, v0
+; GFX9-NEXT: v_pk_add_f16 v0, s2, v3
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: v2i32_to_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_i32 s3, s3, 4
+; GFX11-NEXT: s_add_i32 s2, s2, 2
+; GFX11-NEXT: v_pk_add_f16 v1, 0x48004400, s3
+; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%load = load <2 x i32>, ptr addrspace(1) %in, align 4
%add.v2i32 = add <2 x i32> %load, <i32 2, i32 4>
%bc = bitcast <2 x i32> %add.v2i32 to <4 x half>
@@ -290,9 +2260,1157 @@ define amdgpu_kernel void @v2i32_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1
declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32 immarg)
-; CHECK-LABEL: {{^}}bitcast_v4f32_to_v2i64:
-; CHECK: s_buffer_load_{{dwordx4|b128}}
+
+
define <2 x i64> @bitcast_v4f32_to_v2i64(<2 x i64> %arg) {
+; GCN-LABEL: bitcast_v4f32_to_v2i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v5, v1
+; GCN-NEXT: v_mov_b32_e32 v4, v0
+; GCN-NEXT: s_buffer_load_dwordx4 s[8:11], s[4:7], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v1, s9, v5
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB28_2
+; GCN-NEXT: ; %bb.1:
+; GCN-NEXT: v_cvt_f32_u32_e32 v0, v4
+; GCN-NEXT: v_cvt_f32_u32_e32 v1, v5
+; GCN-NEXT: s_mov_b32 s4, 0x4f800000
+; GCN-NEXT: s_mov_b32 s5, 0xcf800000
+; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v4
+; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v5, vcc
+; GCN-NEXT: v_mov_b32_e32 v8, s9
+; GCN-NEXT: v_fma_f32 v0, v1, s4, v0
+; GCN-NEXT: v_rcp_f32_e32 v0, v0
+; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
+; GCN-NEXT: v_trunc_f32_e32 v1, v1
+; GCN-NEXT: v_fma_f32 v0, v1, s5, v0
+; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GCN-NEXT: v_mul_lo_u32 v9, v6, v1
+; GCN-NEXT: v_mul_lo_u32 v10, v7, v0
+; GCN-NEXT: v_mul_hi_u32 v11, v6, v0
+; GCN-NEXT: v_mul_lo_u32 v12, v6, v0
+; GCN-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; GCN-NEXT: v_mul_hi_u32 v11, v0, v12
+; GCN-NEXT: v_mul_hi_u32 v13, v1, v12
+; GCN-NEXT: v_mul_lo_u32 v12, v1, v12
+; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GCN-NEXT: v_mul_hi_u32 v10, v0, v9
+; GCN-NEXT: v_mul_lo_u32 v14, v0, v9
+; GCN-NEXT: v_mul_hi_u32 v15, v1, v9
+; GCN-NEXT: v_mul_lo_u32 v9, v1, v9
+; GCN-NEXT: v_add_i32_e32 v11, vcc, v11, v14
+; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
+; GCN-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GCN-NEXT: v_addc_u32_e32 v10, vcc, v10, v13, vcc
+; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v15, vcc
+; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v10, vcc
+; GCN-NEXT: v_mul_hi_u32 v9, v6, v0
+; GCN-NEXT: v_mul_lo_u32 v7, v7, v0
+; GCN-NEXT: v_mul_lo_u32 v10, v6, v0
+; GCN-NEXT: v_mul_lo_u32 v6, v6, v1
+; GCN-NEXT: v_mul_hi_u32 v11, v1, v10
+; GCN-NEXT: v_mul_lo_u32 v12, v1, v10
+; GCN-NEXT: v_mul_hi_u32 v10, v0, v10
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GCN-NEXT: v_mul_hi_u32 v7, v1, v6
+; GCN-NEXT: v_mul_hi_u32 v9, v0, v6
+; GCN-NEXT: v_mul_lo_u32 v13, v0, v6
+; GCN-NEXT: v_mul_lo_u32 v6, v1, v6
+; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v13
+; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GCN-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc
+; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v6
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
+; GCN-NEXT: v_mul_hi_u32 v6, s8, v0
+; GCN-NEXT: v_mul_hi_u32 v7, s9, v0
+; GCN-NEXT: v_mul_lo_u32 v0, s9, v0
+; GCN-NEXT: v_mul_hi_u32 v9, s8, v1
+; GCN-NEXT: v_mul_lo_u32 v10, s8, v1
+; GCN-NEXT: v_mul_hi_u32 v11, s9, v1
+; GCN-NEXT: v_mul_lo_u32 v1, s9, v1
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v6, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v9, v7, vcc
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v11, vcc
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc
+; GCN-NEXT: v_mul_hi_u32 v6, v4, v0
+; GCN-NEXT: v_mul_lo_u32 v7, v5, v0
+; GCN-NEXT: v_mul_lo_u32 v9, v4, v0
+; GCN-NEXT: v_mul_lo_u32 v10, v4, v1
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 2, v0
+; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v1, vcc
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v0
+; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v1, vcc
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GCN-NEXT: v_sub_i32_e32 v7, vcc, s9, v6
+; GCN-NEXT: v_sub_i32_e32 v9, vcc, s8, v9
+; GCN-NEXT: v_subb_u32_e64 v7, s[4:5], v7, v5, vcc
+; GCN-NEXT: v_subb_u32_e32 v6, vcc, v8, v6, vcc
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v4
+; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
+; GCN-NEXT: v_sub_i32_e32 v9, vcc, v9, v4
+; GCN-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v4
+; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5
+; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v5
+; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GCN-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v7, v5
+; GCN-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GCN-NEXT: v_cndmask_b32_e32 v4, v14, v12, vcc
+; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e32 v4, v13, v11, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN-NEXT: .LBB28_2: ; %Flow1
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
+; GCN-NEXT: s_cbranch_execz .LBB28_4
+; GCN-NEXT: ; %bb.3:
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0, v4
+; GCN-NEXT: v_cvt_f32_u32_e32 v1, v4
+; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1
+; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
+; GCN-NEXT: v_mul_lo_u32 v0, v0, v1
+; GCN-NEXT: v_mul_hi_u32 v0, v1, v0
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GCN-NEXT: v_mul_hi_u32 v0, s8, v0
+; GCN-NEXT: v_mul_lo_u32 v1, v0, v4
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v0
+; GCN-NEXT: v_sub_i32_e32 v1, vcc, s8, v1
+; GCN-NEXT: v_sub_i32_e32 v6, vcc, v1, v4
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v0
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: .LBB28_4:
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_or_b32_e32 v5, s11, v3
+; GCN-NEXT: v_mov_b32_e32 v4, 0
+; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB28_6
+; GCN-NEXT: ; %bb.5:
+; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2
+; GCN-NEXT: v_cvt_f32_u32_e32 v5, v3
+; GCN-NEXT: s_mov_b32 s4, 0x4f800000
+; GCN-NEXT: s_mov_b32 s5, 0xcf800000
+; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
+; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc
+; GCN-NEXT: v_mov_b32_e32 v8, s11
+; GCN-NEXT: v_fma_f32 v4, v5, s4, v4
+; GCN-NEXT: v_rcp_f32_e32 v4, v4
+; GCN-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; GCN-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
+; GCN-NEXT: v_trunc_f32_e32 v5, v5
+; GCN-NEXT: v_fma_f32 v4, v5, s5, v4
+; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4
+; GCN-NEXT: v_mul_lo_u32 v9, v6, v5
+; GCN-NEXT: v_mul_lo_u32 v10, v7, v4
+; GCN-NEXT: v_mul_hi_u32 v11, v6, v4
+; GCN-NEXT: v_mul_lo_u32 v12, v6, v4
+; GCN-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; GCN-NEXT: v_mul_hi_u32 v11, v4, v12
+; GCN-NEXT: v_mul_hi_u32 v13, v5, v12
+; GCN-NEXT: v_mul_lo_u32 v12, v5, v12
+; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GCN-NEXT: v_mul_hi_u32 v10, v4, v9
+; GCN-NEXT: v_mul_lo_u32 v14, v4, v9
+; GCN-NEXT: v_mul_hi_u32 v15, v5, v9
+; GCN-NEXT: v_mul_lo_u32 v9, v5, v9
+; GCN-NEXT: v_add_i32_e32 v11, vcc, v11, v14
+; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
+; GCN-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GCN-NEXT: v_addc_u32_e32 v10, vcc, v10, v13, vcc
+; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v15, vcc
+; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v10, vcc
+; GCN-NEXT: v_mul_hi_u32 v9, v6, v4
+; GCN-NEXT: v_mul_lo_u32 v7, v7, v4
+; GCN-NEXT: v_mul_lo_u32 v10, v6, v4
+; GCN-NEXT: v_mul_lo_u32 v6, v6, v5
+; GCN-NEXT: v_mul_hi_u32 v11, v5, v10
+; GCN-NEXT: v_mul_lo_u32 v12, v5, v10
+; GCN-NEXT: v_mul_hi_u32 v10, v4, v10
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GCN-NEXT: v_mul_hi_u32 v7, v5, v6
+; GCN-NEXT: v_mul_hi_u32 v9, v4, v6
+; GCN-NEXT: v_mul_lo_u32 v13, v4, v6
+; GCN-NEXT: v_mul_lo_u32 v6, v5, v6
+; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v13
+; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GCN-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc
+; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6
+; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GCN-NEXT: v_mul_hi_u32 v6, s10, v4
+; GCN-NEXT: v_mul_hi_u32 v7, s11, v4
+; GCN-NEXT: v_mul_lo_u32 v4, s11, v4
+; GCN-NEXT: v_mul_hi_u32 v9, s10, v5
+; GCN-NEXT: v_mul_lo_u32 v10, s10, v5
+; GCN-NEXT: v_mul_hi_u32 v11, s11, v5
+; GCN-NEXT: v_mul_lo_u32 v5, s11, v5
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc
+; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v11, vcc
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc
+; GCN-NEXT: v_mul_hi_u32 v6, v2, v4
+; GCN-NEXT: v_mul_lo_u32 v7, v3, v4
+; GCN-NEXT: v_mul_lo_u32 v9, v2, v4
+; GCN-NEXT: v_mul_lo_u32 v10, v2, v5
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 2, v4
+; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v5, vcc
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v4
+; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v5, vcc
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GCN-NEXT: v_sub_i32_e32 v7, vcc, s11, v6
+; GCN-NEXT: v_sub_i32_e32 v9, vcc, s10, v9
+; GCN-NEXT: v_subb_u32_e64 v7, s[4:5], v7, v3, vcc
+; GCN-NEXT: v_subb_u32_e32 v6, vcc, v8, v6, vcc
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v2
+; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
+; GCN-NEXT: v_sub_i32_e32 v9, vcc, v9, v2
+; GCN-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3
+; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3
+; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
+; GCN-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GCN-NEXT: v_cndmask_b32_e32 v2, v14, v12, vcc
+; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6
+; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e32 v2, v13, v11, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[4:5]
+; GCN-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GCN-NEXT: .LBB28_6: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
+; GCN-NEXT: s_cbranch_execz .LBB28_8
+; GCN-NEXT: ; %bb.7:
+; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
+; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2
+; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v4
+; GCN-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
+; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4
+; GCN-NEXT: v_mul_lo_u32 v3, v3, v4
+; GCN-NEXT: v_mul_hi_u32 v3, v4, v3
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT: v_mul_hi_u32 v3, s10, v3
+; GCN-NEXT: v_mul_lo_u32 v4, v3, v2
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3
+; GCN-NEXT: v_sub_i32_e32 v4, vcc, s10, v4
+; GCN-NEXT: v_sub_i32_e32 v6, vcc, v4, v2
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2
+; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2
+; GCN-NEXT: v_cndmask_b32_e32 v4, v3, v5, vcc
+; GCN-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NEXT: .LBB28_8:
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v2, v4
+; GCN-NEXT: v_mov_b32_e32 v3, v5
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v4f32_to_v2i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: s_buffer_load_dwordx4 s[8:11], s[4:7], 0x0
+; VI-NEXT: v_mov_b32_e32 v5, v1
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_or_b32_e32 v1, s9, v5
+; VI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; VI-NEXT: ; implicit-def: $vgpr0_vgpr1
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB28_2
+; VI-NEXT: ; %bb.1:
+; VI-NEXT: v_cvt_f32_u32_e32 v0, v4
+; VI-NEXT: v_cvt_f32_u32_e32 v1, v5
+; VI-NEXT: v_sub_u32_e32 v10, vcc, 0, v4
+; VI-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc
+; VI-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
+; VI-NEXT: v_rcp_f32_e32 v0, v0
+; VI-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
+; VI-NEXT: v_trunc_f32_e32 v1, v1
+; VI-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
+; VI-NEXT: v_cvt_u32_f32_e32 v8, v1
+; VI-NEXT: v_cvt_u32_f32_e32 v9, v0
+; VI-NEXT: v_mul_lo_u32 v6, v10, v8
+; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v9, 0
+; VI-NEXT: v_mul_lo_u32 v7, v11, v9
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v6
+; VI-NEXT: v_add_u32_e32 v13, vcc, v1, v7
+; VI-NEXT: v_mul_hi_u32 v12, v9, v0
+; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v13, 0
+; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0
+; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v6
+; VI-NEXT: v_addc_u32_e32 v14, vcc, 0, v7, vcc
+; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v13, 0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v12, v0
+; VI-NEXT: v_addc_u32_e32 v0, vcc, v14, v1, vcc
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v12, vcc, v9, v0
+; VI-NEXT: v_addc_u32_e32 v13, vcc, v8, v1, vcc
+; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v12, 0
+; VI-NEXT: v_mul_lo_u32 v8, v10, v13
+; VI-NEXT: v_mul_lo_u32 v9, v11, v12
+; VI-NEXT: v_mul_hi_u32 v10, v12, v0
+; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v0, 0
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v8
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v9
+; VI-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v1, 0
+; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v1, 0
+; VI-NEXT: v_add_u32_e32 v8, vcc, v10, v8
+; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; VI-NEXT: v_add_u32_e32 v6, vcc, v8, v6
+; VI-NEXT: v_addc_u32_e32 v6, vcc, v9, v7, vcc
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, v6, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v6, vcc, v12, v0
+; VI-NEXT: v_addc_u32_e32 v7, vcc, v13, v1, vcc
+; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s8, v7, 0
+; VI-NEXT: v_mul_hi_u32 v8, s8, v6
+; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v0
+; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s9, v6, 0
+; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s9, v7, 0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v8, v0
+; VI-NEXT: v_addc_u32_e32 v0, vcc, v9, v1, vcc
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
+; VI-NEXT: v_add_u32_e32 v6, vcc, v0, v6
+; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
+; VI-NEXT: v_mul_lo_u32 v8, v4, v7
+; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v6, 0
+; VI-NEXT: v_mul_lo_u32 v9, v5, v6
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v8
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v9
+; VI-NEXT: v_sub_u32_e32 v8, vcc, s9, v1
+; VI-NEXT: v_sub_u32_e32 v0, vcc, s8, v0
+; VI-NEXT: v_subb_u32_e64 v8, s[4:5], v8, v5, vcc
+; VI-NEXT: v_sub_u32_e64 v9, s[4:5], v0, v4
+; VI-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v8, s[4:5]
+; VI-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5
+; VI-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; VI-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4
+; VI-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v5
+; VI-NEXT: v_cndmask_b32_e64 v8, v10, v9, s[4:5]
+; VI-NEXT: v_add_u32_e64 v9, s[4:5], 2, v6
+; VI-NEXT: v_addc_u32_e64 v10, s[4:5], 0, v7, s[4:5]
+; VI-NEXT: v_add_u32_e64 v11, s[4:5], 1, v6
+; VI-NEXT: v_addc_u32_e64 v12, s[4:5], 0, v7, s[4:5]
+; VI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8
+; VI-NEXT: v_cndmask_b32_e64 v8, v12, v10, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v10, s9
+; VI-NEXT: v_subb_u32_e32 v1, vcc, v10, v1, vcc
+; VI-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
+; VI-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
+; VI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; VI-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, v11, v9, s[4:5]
+; VI-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; VI-NEXT: ; implicit-def: $vgpr4_vgpr5
+; VI-NEXT: .LBB28_2: ; %Flow1
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
+; VI-NEXT: s_cbranch_execz .LBB28_4
+; VI-NEXT: ; %bb.3:
+; VI-NEXT: v_cvt_f32_u32_e32 v0, v4
+; VI-NEXT: v_sub_u32_e32 v1, vcc, 0, v4
+; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
+; VI-NEXT: v_mul_lo_u32 v1, v1, v0
+; VI-NEXT: v_mul_hi_u32 v1, v0, v1
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; VI-NEXT: v_mul_hi_u32 v0, s8, v0
+; VI-NEXT: v_mul_lo_u32 v1, v0, v4
+; VI-NEXT: v_add_u32_e32 v5, vcc, 1, v0
+; VI-NEXT: v_sub_u32_e32 v1, vcc, s8, v1
+; VI-NEXT: v_sub_u32_e32 v6, vcc, v1, v4
+; VI-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; VI-NEXT: v_add_u32_e32 v5, vcc, 1, v0
+; VI-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: .LBB28_4:
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_or_b32_e32 v5, s11, v3
+; VI-NEXT: v_mov_b32_e32 v4, 0
+; VI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; VI-NEXT: ; implicit-def: $vgpr4_vgpr5
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB28_6
+; VI-NEXT: ; %bb.5:
+; VI-NEXT: v_cvt_f32_u32_e32 v4, v2
+; VI-NEXT: v_cvt_f32_u32_e32 v5, v3
+; VI-NEXT: v_sub_u32_e32 v10, vcc, 0, v2
+; VI-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc
+; VI-NEXT: v_madmk_f32 v4, v5, 0x4f800000, v4
+; VI-NEXT: v_rcp_f32_e32 v4, v4
+; VI-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; VI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
+; VI-NEXT: v_trunc_f32_e32 v5, v5
+; VI-NEXT: v_madmk_f32 v4, v5, 0xcf800000, v4
+; VI-NEXT: v_cvt_u32_f32_e32 v8, v5
+; VI-NEXT: v_cvt_u32_f32_e32 v9, v4
+; VI-NEXT: v_mul_lo_u32 v6, v10, v8
+; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
+; VI-NEXT: v_mul_lo_u32 v7, v11, v9
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v6
+; VI-NEXT: v_add_u32_e32 v7, vcc, v5, v7
+; VI-NEXT: v_mul_hi_u32 v12, v9, v4
+; VI-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0
+; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v5
+; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0
+; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v6, vcc
+; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0
+; VI-NEXT: v_add_u32_e32 v4, vcc, v12, v4
+; VI-NEXT: v_addc_u32_e32 v4, vcc, v13, v5, vcc
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; VI-NEXT: v_add_u32_e32 v12, vcc, v9, v4
+; VI-NEXT: v_addc_u32_e32 v13, vcc, v8, v5, vcc
+; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0
+; VI-NEXT: v_mul_lo_u32 v8, v10, v13
+; VI-NEXT: v_mul_lo_u32 v9, v11, v12
+; VI-NEXT: v_mul_hi_u32 v10, v12, v4
+; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v4, 0
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v8
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v9
+; VI-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, 0
+; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v5, 0
+; VI-NEXT: v_add_u32_e32 v8, vcc, v10, v8
+; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; VI-NEXT: v_add_u32_e32 v6, vcc, v8, v6
+; VI-NEXT: v_addc_u32_e32 v6, vcc, v9, v7, vcc
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v4
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; VI-NEXT: v_add_u32_e32 v6, vcc, v12, v4
+; VI-NEXT: v_addc_u32_e32 v7, vcc, v13, v5, vcc
+; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s10, v7, 0
+; VI-NEXT: v_mul_hi_u32 v8, s10, v6
+; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v4
+; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s11, v6, 0
+; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s11, v7, 0
+; VI-NEXT: v_add_u32_e32 v4, vcc, v8, v4
+; VI-NEXT: v_addc_u32_e32 v4, vcc, v9, v5, vcc
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; VI-NEXT: v_add_u32_e32 v6, vcc, v4, v6
+; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
+; VI-NEXT: v_mul_lo_u32 v8, v2, v7
+; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, 0
+; VI-NEXT: v_mul_lo_u32 v9, v3, v6
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v8
+; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v9
+; VI-NEXT: v_sub_u32_e32 v8, vcc, s11, v5
+; VI-NEXT: v_sub_u32_e32 v4, vcc, s10, v4
+; VI-NEXT: v_subb_u32_e64 v8, s[4:5], v8, v3, vcc
+; VI-NEXT: v_sub_u32_e64 v9, s[4:5], v4, v2
+; VI-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v8, s[4:5]
+; VI-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3
+; VI-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; VI-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v2
+; VI-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v3
+; VI-NEXT: v_cndmask_b32_e64 v8, v10, v9, s[4:5]
+; VI-NEXT: v_add_u32_e64 v9, s[4:5], 2, v6
+; VI-NEXT: v_addc_u32_e64 v10, s[4:5], 0, v7, s[4:5]
+; VI-NEXT: v_add_u32_e64 v11, s[4:5], 1, v6
+; VI-NEXT: v_addc_u32_e64 v12, s[4:5], 0, v7, s[4:5]
+; VI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8
+; VI-NEXT: v_cndmask_b32_e64 v8, v12, v10, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v10, s11
+; VI-NEXT: v_subb_u32_e32 v5, vcc, v10, v5, vcc
+; VI-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3
+; VI-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
+; VI-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; VI-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: v_cndmask_b32_e64 v2, v11, v9, s[4:5]
+; VI-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc
+; VI-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc
+; VI-NEXT: ; implicit-def: $vgpr2_vgpr3
+; VI-NEXT: .LBB28_6: ; %Flow
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
+; VI-NEXT: s_cbranch_execz .LBB28_8
+; VI-NEXT: ; %bb.7:
+; VI-NEXT: v_cvt_f32_u32_e32 v3, v2
+; VI-NEXT: v_sub_u32_e32 v4, vcc, 0, v2
+; VI-NEXT: v_rcp_iflag_f32_e32 v3, v3
+; VI-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; VI-NEXT: v_cvt_u32_f32_e32 v3, v3
+; VI-NEXT: v_mul_lo_u32 v4, v4, v3
+; VI-NEXT: v_mul_hi_u32 v4, v3, v4
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v4
+; VI-NEXT: v_mul_hi_u32 v3, s10, v3
+; VI-NEXT: v_mul_lo_u32 v4, v3, v2
+; VI-NEXT: v_add_u32_e32 v5, vcc, 1, v3
+; VI-NEXT: v_sub_u32_e32 v4, vcc, s10, v4
+; VI-NEXT: v_sub_u32_e32 v6, vcc, v4, v2
+; VI-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2
+; VI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT: v_add_u32_e32 v5, vcc, 1, v3
+; VI-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2
+; VI-NEXT: v_cndmask_b32_e32 v4, v3, v5, vcc
+; VI-NEXT: v_mov_b32_e32 v5, 0
+; VI-NEXT: .LBB28_8:
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v2, v4
+; VI-NEXT: v_mov_b32_e32 v3, v5
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v4f32_to_v2i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_buffer_load_dwordx4 s[8:11], s[4:7], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v5, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_or_b32_e32 v1, s9, v5
+; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB28_2
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v4
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v5
+; GFX9-NEXT: v_sub_co_u32_e32 v10, vcc, 0, v4
+; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v5, vcc
+; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
+; GFX9-NEXT: v_rcp_f32_e32 v0, v0
+; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
+; GFX9-NEXT: v_trunc_f32_e32 v1, v1
+; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v1
+; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v0
+; GFX9-NEXT: v_mul_lo_u32 v6, v10, v8
+; GFX9-NEXT: v_mul_lo_u32 v7, v11, v9
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v9, 0
+; GFX9-NEXT: v_add3_u32 v12, v1, v6, v7
+; GFX9-NEXT: v_mul_hi_u32 v1, v9, v0
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v12, 0
+; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v1, v6
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0
+; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v7, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v12, 0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v13, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v14, v1, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13
+; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v12, 0
+; GFX9-NEXT: v_add3_u32 v1, v1, v6, v7
+; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v1, 0
+; GFX9-NEXT: v_mul_hi_u32 v10, v12, v0
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v1, 0
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v0, 0
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v9, v1, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s8, v7, 0
+; GFX9-NEXT: v_mul_hi_u32 v8, s8, v6
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s9, v6, 0
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s9, v7, 0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v9, v1, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v8, v5, v6
+; GFX9-NEXT: v_mul_lo_u32 v9, v4, v7
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v6, 0
+; GFX9-NEXT: v_add3_u32 v1, v1, v9, v8
+; GFX9-NEXT: v_sub_u32_e32 v8, s9, v1
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s8, v0
+; GFX9-NEXT: v_subb_co_u32_e64 v8, s[4:5], v8, v5, vcc
+; GFX9-NEXT: v_sub_co_u32_e64 v9, s[4:5], v0, v4
+; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[4:5], 0, v8, s[4:5]
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5
+; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4
+; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v5
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v10, v9, s[4:5]
+; GFX9-NEXT: v_add_co_u32_e64 v9, s[4:5], 2, v6
+; GFX9-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, v7, s[4:5]
+; GFX9-NEXT: v_add_co_u32_e64 v11, s[4:5], 1, v6
+; GFX9-NEXT: v_addc_co_u32_e64 v12, s[4:5], 0, v7, s[4:5]
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v12, v10, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v10, s9
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v10, v1, vcc
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
+; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v11, v9, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX9-NEXT: .LBB28_2: ; %Flow1
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
+; GFX9-NEXT: s_cbranch_execz .LBB28_4
+; GFX9-NEXT: ; %bb.3:
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v4
+; GFX9-NEXT: v_sub_u32_e32 v1, 0, v4
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT: v_mul_lo_u32 v1, v1, v0
+; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
+; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0
+; GFX9-NEXT: v_mul_lo_u32 v1, v0, v4
+; GFX9-NEXT: v_add_u32_e32 v5, 1, v0
+; GFX9-NEXT: v_sub_u32_e32 v1, s8, v1
+; GFX9-NEXT: v_sub_u32_e32 v6, v1, v4
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GFX9-NEXT: v_add_u32_e32 v5, 1, v0
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: .LBB28_4:
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_or_b32_e32 v5, s11, v3
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB28_6
+; GFX9-NEXT: ; %bb.5:
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3
+; GFX9-NEXT: v_sub_co_u32_e32 v10, vcc, 0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
+; GFX9-NEXT: v_madmk_f32 v4, v5, 0x4f800000, v4
+; GFX9-NEXT: v_rcp_f32_e32 v4, v4
+; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
+; GFX9-NEXT: v_trunc_f32_e32 v5, v5
+; GFX9-NEXT: v_madmk_f32 v4, v5, 0xcf800000, v4
+; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v5
+; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v4
+; GFX9-NEXT: v_mul_lo_u32 v6, v10, v8
+; GFX9-NEXT: v_mul_lo_u32 v7, v11, v9
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
+; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7
+; GFX9-NEXT: v_mul_hi_u32 v12, v9, v4
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0
+; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v5
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0
+; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v6, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v5, vcc
+; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13
+; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0
+; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0
+; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0
+; GFX9-NEXT: v_mul_hi_u32 v11, v12, v4
+; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v11, v7
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v10, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v5, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s10, v7, 0
+; GFX9-NEXT: v_mul_hi_u32 v8, s10, v6
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v5, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s11, v6, 0
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s11, v7, 0
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc
+; GFX9-NEXT: v_mul_lo_u32 v8, v3, v6
+; GFX9-NEXT: v_mul_lo_u32 v9, v2, v7
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, 0
+; GFX9-NEXT: v_add3_u32 v5, v5, v9, v8
+; GFX9-NEXT: v_sub_u32_e32 v8, s11, v5
+; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s10, v4
+; GFX9-NEXT: v_subb_co_u32_e64 v8, s[4:5], v8, v3, vcc
+; GFX9-NEXT: v_sub_co_u32_e64 v9, s[4:5], v4, v2
+; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[4:5], 0, v8, s[4:5]
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v10, v9, s[4:5]
+; GFX9-NEXT: v_add_co_u32_e64 v9, s[4:5], 2, v6
+; GFX9-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, v7, s[4:5]
+; GFX9-NEXT: v_add_co_u32_e64 v11, s[4:5], 1, v6
+; GFX9-NEXT: v_addc_co_u32_e64 v12, s[4:5], 0, v7, s[4:5]
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v12, v10, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v10, s11
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v10, v5, vcc
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v11, v9, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc
+; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX9-NEXT: .LBB28_6: ; %Flow
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
+; GFX9-NEXT: s_cbranch_execz .LBB28_8
+; GFX9-NEXT: ; %bb.7:
+; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v2
+; GFX9-NEXT: v_sub_u32_e32 v4, 0, v2
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3
+; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT: v_mul_lo_u32 v4, v4, v3
+; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4
+; GFX9-NEXT: v_add_u32_e32 v3, v3, v4
+; GFX9-NEXT: v_mul_hi_u32 v3, s10, v3
+; GFX9-NEXT: v_mul_lo_u32 v4, v3, v2
+; GFX9-NEXT: v_add_u32_e32 v5, 1, v3
+; GFX9-NEXT: v_sub_u32_e32 v4, s10, v4
+; GFX9-NEXT: v_sub_u32_e32 v6, v4, v2
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX9-NEXT: v_add_u32_e32 v5, 1, v3
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v5, vcc
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: .LBB28_8:
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v4f32_to_v2i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_buffer_load_b128 s[4:7], s[0:3], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v1, s5, v5
+; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_xor_b32 s1, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execz .LBB28_2
+; GFX11-NEXT: ; %bb.1:
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v4
+; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v5
+; GFX11-NEXT: v_sub_co_u32 v10, vcc_lo, 0, v4
+; GFX11-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, 0, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0
+; GFX11-NEXT: v_rcp_f32_e32 v0, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
+; GFX11-NEXT: v_trunc_f32_e32 v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0
+; GFX11-NEXT: v_cvt_u32_f32_e32 v12, v1
+; GFX11-NEXT: v_cvt_u32_f32_e32 v13, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_lo_u32 v6, v10, v12
+; GFX11-NEXT: v_mul_lo_u32 v7, v11, v13
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v10, v13, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add3_u32 v14, v1, v6, v7
+; GFX11-NEXT: v_mul_hi_u32 v15, v13, v0
+; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v12, v0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v13, v14, 0
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v12, v14, 0
+; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v15, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v9, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v6, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v13, vcc_lo, v13, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v12, v1, vcc_lo
+; GFX11-NEXT: v_mul_lo_u32 v6, v11, v13
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v10, v13, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_lo_u32 v7, v10, v12
+; GFX11-NEXT: v_mul_hi_u32 v11, v13, v0
+; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v12, v0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v10, v1, v7, v6
+; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v13, v10, 0
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v12, v10, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v11, v6
+; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, v8
+; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v9, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v6, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v13, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v12, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_mul_hi_u32 v11, s4, v8
+; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, s5, v8, 0
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, s4, v10, 0
+; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, s5, v10, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v11, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v6
+; GFX11-NEXT: v_add_co_ci_u32_e32 v0, vcc_lo, v1, v7, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v9, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v0, v8
+; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_mul_lo_u32 v8, v5, v6
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v6, 0
+; GFX11-NEXT: v_mul_lo_u32 v9, v4, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, s4, v0
+; GFX11-NEXT: v_add3_u32 v1, v1, v9, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_sub_nc_u32_e32 v8, s5, v1
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v8, s0, v8, v5, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v9, s0, v6, 2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v10, s0, 0, v7, s0
+; GFX11-NEXT: v_sub_co_u32 v11, s0, v0, v4
+; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo
+; GFX11-NEXT: v_subrev_co_ci_u32_e64 v8, s0, 0, v8, s0
+; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v11, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, v1, v5
+; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc_lo
+; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v8, v5
+; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc_lo
+; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v0, v4
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v1, v5
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v12, v11, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v11, vcc_lo, v6, 1
+; GFX11-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v7, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_cndmask_b32 v1, v12, v10 :: v_dual_cndmask_b32 v4, v11, v9
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v1 :: v_dual_cndmask_b32 v0, v6, v4
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX11-NEXT: .LBB28_2: ; %Flow1
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s1
+; GFX11-NEXT: s_cbranch_execz .LBB28_4
+; GFX11-NEXT: ; %bb.3:
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v4
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_lo_u32 v1, v1, v0
+; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_hi_u32 v0, s4, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_lo_u32 v1, v0, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 1, v0
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, s4, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_nc_u32_e32 v6, v1, v4
+; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v1, v4
+; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v6 :: v_dual_cndmask_b32 v0, v0, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v1, v4
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-NEXT: .LBB28_4:
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_or_b32_e32 v5, s7, v3
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-NEXT: s_xor_b32 s1, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execz .LBB28_6
+; GFX11-NEXT: ; %bb.5:
+; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v2
+; GFX11-NEXT: v_cvt_f32_u32_e32 v5, v3
+; GFX11-NEXT: v_sub_co_u32 v11, vcc_lo, 0, v2
+; GFX11-NEXT: v_sub_co_ci_u32_e32 v12, vcc_lo, 0, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fmamk_f32 v4, v5, 0x4f800000, v4
+; GFX11-NEXT: v_rcp_f32_e32 v4, v4
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
+; GFX11-NEXT: v_trunc_f32_e32 v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_fmamk_f32 v4, v5, 0xcf800000, v4
+; GFX11-NEXT: v_cvt_u32_f32_e32 v13, v5
+; GFX11-NEXT: v_cvt_u32_f32_e32 v14, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_lo_u32 v6, v11, v13
+; GFX11-NEXT: v_mul_lo_u32 v7, v12, v14
+; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v11, v14, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add3_u32 v15, v5, v6, v7
+; GFX11-NEXT: v_mul_hi_u32 v16, v14, v4
+; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v13, v4, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v14, v15, 0
+; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v13, v15, 0
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v16, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v6, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v8, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v10, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v13, v5, vcc_lo
+; GFX11-NEXT: v_mul_lo_u32 v6, v12, v14
+; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v11, v14, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_lo_u32 v7, v11, v13
+; GFX11-NEXT: v_mul_hi_u32 v12, v14, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add3_u32 v11, v5, v7, v6
+; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v13, v4, 0
+; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v14, v11, 0
+; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v13, v11, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v12, v5
+; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v6, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v7
+; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v8, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v10, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v9
+; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v14, v4
+; GFX11-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v13, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_mul_hi_u32 v11, s6, v8
+; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, s7, v8, 0
+; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, s6, v10, 0
+; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, s7, v10, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v11, v4
+; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v7, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v9, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v4, v8
+; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_mul_lo_u32 v8, v3, v6
+; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v2, v6, 0
+; GFX11-NEXT: v_mul_lo_u32 v9, v2, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, s6, v4
+; GFX11-NEXT: v_add3_u32 v5, v5, v9, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_sub_nc_u32_e32 v8, s7, v5
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v8, s0, v8, v3, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v9, s0, v6, 2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v10, s0, 0, v7, s0
+; GFX11-NEXT: v_sub_co_u32 v11, s0, v4, v2
+; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, s7, v5, vcc_lo
+; GFX11-NEXT: v_subrev_co_ci_u32_e64 v8, s0, 0, v8, s0
+; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v11, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, v5, v3
+; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc_lo
+; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v8, v3
+; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc_lo
+; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v4, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo
+; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v5, v3
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, v2, s0
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v12, v11, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v11, vcc_lo, v6, 1
+; GFX11-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v7, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_cndmask_b32 v3, v12, v10 :: v_dual_cndmask_b32 v4, v11, v9
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v3 :: v_dual_cndmask_b32 v4, v6, v4
+; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX11-NEXT: .LBB28_6: ; %Flow
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s1
+; GFX11-NEXT: s_cbranch_execz .LBB28_8
+; GFX11-NEXT: ; %bb.7:
+; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v2
+; GFX11-NEXT: v_sub_nc_u32_e32 v4, 0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_rcp_iflag_f32_e32 v3, v3
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_lo_u32 v4, v4, v3
+; GFX11-NEXT: v_mul_hi_u32 v4, v3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v4
+; GFX11-NEXT: v_mul_hi_u32 v3, s6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_lo_u32 v4, v3, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 1, v3
+; GFX11-NEXT: v_sub_nc_u32_e32 v4, s6, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_nc_u32_e32 v6, v4, v2
+; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v4, v2
+; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v4, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 1, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_cndmask_b32 v4, v3, v5 :: v_dual_mov_b32 v5, 0
+; GFX11-NEXT: .LBB28_8:
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> poison, i32 0, i32 0)
%cast = bitcast <4 x float> %val to <2 x i64>
%div = udiv <2 x i64> %cast, %arg
@@ -301,8 +3419,44 @@ define <2 x i64> @bitcast_v4f32_to_v2i64(<2 x i64> %arg) {
declare half @llvm.canonicalize.f16(half)
-; CHECK-LABEL: {{^}}bitcast_f32_to_v1i32:
+
define amdgpu_kernel void @bitcast_f32_to_v1i32(ptr addrspace(1) %out) {
+; GCN-LABEL: bitcast_f32_to_v1i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x387c0000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_f32_to_v1i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: v_mov_b32_e32 v2, 0x387c0000
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_f32_to_v1i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x387c0000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_f32_to_v1i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x387c0000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
%f16 = call arcp afn half @llvm.canonicalize.f16(half 0xH03F0)
%f32 = fpext half %f16 to float
%v = bitcast float %f32 to <1 x i32>
@@ -311,8 +3465,91 @@ define amdgpu_kernel void @bitcast_f32_to_v1i32(ptr addrspace(1) %out) {
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v4i64_to_v16i16:
+
define amdgpu_kernel void @bitcast_v4i64_to_v16i16(i32 %cond, ptr addrspace(1) %out, <4 x i64> %value) {
+; GCN-LABEL: bitcast_v4i64_to_v16i16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s1, s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s15, 0xf000
+; GCN-NEXT: s_mov_b32 s14, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s0
+; GCN-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v4i64_to_v16i16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s9, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s9, 0
+; VI-NEXT: s_add_u32 s6, s4, 16
+; VI-NEXT: s_addc_u32 s7, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s6
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s7
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v4i64_to_v16i16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x2c
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11] offset:16
+; GFX9-NEXT: s_cmp_lg_u32 s9, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v4i64_to_v16i16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_mov_b32_e32 v6, s0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5]
+; GFX11-NEXT: s_cmp_lg_u32 s9, 0
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -329,8 +3566,130 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v4f64_to_v16f16:
+
define amdgpu_kernel void @bitcast_v4f64_to_v16f16(i32 %cond, ptr addrspace(1) %out, <4 x double> %value) {
+; GCN-LABEL: bitcast_v4f64_to_v16f16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s11, 0
+; GCN-NEXT: s_mov_b32 s18, 0
+; GCN-NEXT: s_mov_b32 s15, 0
+; GCN-NEXT: s_mov_b32 s19, 0
+; GCN-NEXT: s_mov_b32 s16, 0
+; GCN-NEXT: s_mov_b32 s20, 0
+; GCN-NEXT: s_mov_b32 s17, 0
+; GCN-NEXT: s_mov_b32 s21, 0
+; GCN-NEXT: s_mov_b32 s8, 0
+; GCN-NEXT: s_mov_b32 s12, 0
+; GCN-NEXT: s_mov_b32 s9, 0
+; GCN-NEXT: s_mov_b32 s13, 0
+; GCN-NEXT: s_mov_b32 s10, 0
+; GCN-NEXT: s_mov_b32 s14, 0
+; GCN-NEXT: s_mov_b32 s7, 0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, s18
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, s11
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, s19
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, s15
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, s20
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, s16
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, s21
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, s17
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, s12
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, s8
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, s13
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, s9
+; GCN-NEXT: v_cvt_f16_f32_e32 v12, s14
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, s10
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, s6
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, s7
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: v_or_b32_e32 v1, v3, v2
+; GCN-NEXT: v_or_b32_e32 v2, v5, v4
+; GCN-NEXT: v_or_b32_e32 v3, v7, v6
+; GCN-NEXT: v_or_b32_e32 v4, v9, v8
+; GCN-NEXT: v_or_b32_e32 v5, v11, v10
+; GCN-NEXT: v_or_b32_e32 v6, v13, v12
+; GCN-NEXT: v_or_b32_e32 v7, v15, v14
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v4f64_to_v16f16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s9, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s9, 0
+; VI-NEXT: s_add_u32 s6, s4, 16
+; VI-NEXT: s_addc_u32 s7, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s6
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s7
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v4f64_to_v16f16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x2c
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11] offset:16
+; GFX9-NEXT: s_cmp_lg_u32 s9, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v4f64_to_v16f16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_mov_b32_e32 v6, s0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5]
+; GFX11-NEXT: s_cmp_lg_u32 s9, 0
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -347,8 +3706,91 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v16i16_to_v4i64:
+
define amdgpu_kernel void @bitcast_v16i16_to_v4i64(i32 %cond, ptr addrspace(1) %out, <16 x i16> %value) {
+; GCN-LABEL: bitcast_v16i16_to_v4i64:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s1, s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s15, 0xf000
+; GCN-NEXT: s_mov_b32 s14, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s0
+; GCN-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v16i16_to_v4i64:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s9, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s9, 0
+; VI-NEXT: s_add_u32 s6, s4, 16
+; VI-NEXT: s_addc_u32 s7, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s6
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s7
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v16i16_to_v4i64:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x2c
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11] offset:16
+; GFX9-NEXT: s_cmp_lg_u32 s9, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v16i16_to_v4i64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_mov_b32_e32 v6, s0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5]
+; GFX11-NEXT: s_cmp_lg_u32 s9, 0
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -365,8 +3807,91 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v16f16_to_v4f64:
+
define amdgpu_kernel void @bitcast_v16f16_to_v4f64(i32 %cond, ptr addrspace(1) %out, <16 x half> %value) {
+; GCN-LABEL: bitcast_v16f16_to_v4f64:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s1, s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s15, 0xf000
+; GCN-NEXT: s_mov_b32 s14, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s0
+; GCN-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v16f16_to_v4f64:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s9, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s9, 0
+; VI-NEXT: s_add_u32 s6, s4, 16
+; VI-NEXT: s_addc_u32 s7, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s6
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s7
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v16f16_to_v4f64:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x2c
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11] offset:16
+; GFX9-NEXT: s_cmp_lg_u32 s9, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v16f16_to_v4f64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_mov_b32_e32 v6, s0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5]
+; GFX11-NEXT: s_cmp_lg_u32 s9, 0
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -383,9 +3908,110 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v20f16_to_v5f64:
-; CHECK: ScratchSize: 0
+
+
define amdgpu_kernel void @bitcast_v20f16_to_v5f64(i32 %cond, ptr addrspace(1) %out, <20 x half> %value) {
+; GCN-LABEL: bitcast_v20f16_to_v5f64:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_mov_b32 s0, 0
+; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s15, 0xf000
+; GCN-NEXT: s_mov_b32 s14, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s0
+; GCN-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s0
+; GCN-NEXT: v_mov_b32_e32 v8, s0
+; GCN-NEXT: v_mov_b32_e32 v9, s0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
+; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v20f16_to_v5f64:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s0, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_add_u32 s8, s4, 16
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_addc_u32 s9, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s9
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: s_add_u32 s0, s4, 32
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v20f16_to_v5f64:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_lg_u32 s0, 0
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v20f16_to_v5f64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
+; GFX11-NEXT: v_mov_b32_e32 v8, s0
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5]
+; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -402,9 +4028,110 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v10f32_to_v5f64:
-; CHECK: ScratchSize: 0
+
+
define amdgpu_kernel void @bitcast_v10f32_to_v5f64(i32 %cond, ptr addrspace(1) %out, <10 x float> %value) {
+; GCN-LABEL: bitcast_v10f32_to_v5f64:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_mov_b32 s0, 0
+; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s15, 0xf000
+; GCN-NEXT: s_mov_b32 s14, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s0
+; GCN-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s0
+; GCN-NEXT: v_mov_b32_e32 v8, s0
+; GCN-NEXT: v_mov_b32_e32 v9, s0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
+; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v10f32_to_v5f64:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s0, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_add_u32 s8, s4, 16
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_addc_u32 s9, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s9
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: s_add_u32 s0, s4, 32
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v10f32_to_v5f64:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_lg_u32 s0, 0
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v10f32_to_v5f64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
+; GFX11-NEXT: v_mov_b32_e32 v8, s0
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5]
+; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -421,9 +4148,110 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v10i32_to_v5f64:
-; CHECK: ScratchSize: 0
+
+
define amdgpu_kernel void @bitcast_v10i32_to_v5f64(i32 %cond, ptr addrspace(1) %out, <10 x i32> %value) {
+; GCN-LABEL: bitcast_v10i32_to_v5f64:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_mov_b32 s0, 0
+; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s15, 0xf000
+; GCN-NEXT: s_mov_b32 s14, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s0
+; GCN-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s0
+; GCN-NEXT: v_mov_b32_e32 v8, s0
+; GCN-NEXT: v_mov_b32_e32 v9, s0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
+; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v10i32_to_v5f64:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s0, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_add_u32 s8, s4, 16
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_addc_u32 s9, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s9
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: s_add_u32 s0, s4, 32
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v10i32_to_v5f64:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_lg_u32 s0, 0
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v10i32_to_v5f64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
+; GFX11-NEXT: v_mov_b32_e32 v8, s0
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5]
+; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -440,9 +4268,110 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v10f32_to_v5i64:
-; CHECK: ScratchSize: 0
+
+
define amdgpu_kernel void @bitcast_v10f32_to_v5i64(i32 %cond, ptr addrspace(1) %out, <10 x float> %value) {
+; GCN-LABEL: bitcast_v10f32_to_v5i64:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_mov_b32 s0, 0
+; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s15, 0xf000
+; GCN-NEXT: s_mov_b32 s14, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s0
+; GCN-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s0
+; GCN-NEXT: v_mov_b32_e32 v8, s0
+; GCN-NEXT: v_mov_b32_e32 v9, s0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
+; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v10f32_to_v5i64:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s0, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_add_u32 s8, s4, 16
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_addc_u32 s9, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s9
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: s_add_u32 s0, s4, 32
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v10f32_to_v5i64:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_lg_u32 s0, 0
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v10f32_to_v5i64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
+; GFX11-NEXT: v_mov_b32_e32 v8, s0
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5]
+; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -459,9 +4388,110 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v10i32_to_v5i64:
-; CHECK: ScratchSize: 0
+
+
define amdgpu_kernel void @bitcast_v10i32_to_v5i64(i32 %cond, ptr addrspace(1) %out, <10 x i32> %value) {
+; GCN-LABEL: bitcast_v10i32_to_v5i64:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_mov_b32 s0, 0
+; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s15, 0xf000
+; GCN-NEXT: s_mov_b32 s14, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s0
+; GCN-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s0
+; GCN-NEXT: v_mov_b32_e32 v8, s0
+; GCN-NEXT: v_mov_b32_e32 v9, s0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
+; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v10i32_to_v5i64:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s0, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_add_u32 s8, s4, 16
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_addc_u32 s9, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s9
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: s_add_u32 s0, s4, 32
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v10i32_to_v5i64:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_lg_u32 s0, 0
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v10i32_to_v5i64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
+; GFX11-NEXT: v_mov_b32_e32 v8, s0
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5]
+; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -478,9 +4508,110 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v40i8_to_v5f64:
-; CHECK: ScratchSize: 0
+
+
define amdgpu_kernel void @bitcast_v40i8_to_v5f64(i32 %cond, ptr addrspace(1) %out, <40 x i8> %value) {
+; GCN-LABEL: bitcast_v40i8_to_v5f64:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_mov_b32 s0, 0
+; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s15, 0xf000
+; GCN-NEXT: s_mov_b32 s14, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s0
+; GCN-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s0
+; GCN-NEXT: v_mov_b32_e32 v8, s0
+; GCN-NEXT: v_mov_b32_e32 v9, s0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
+; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v40i8_to_v5f64:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s0, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_add_u32 s8, s4, 16
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_addc_u32 s9, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s9
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: s_add_u32 s0, s4, 32
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v40i8_to_v5f64:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_lg_u32 s0, 0
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v40i8_to_v5f64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
+; GFX11-NEXT: v_mov_b32_e32 v8, s0
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5]
+; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -497,9 +4628,110 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v40i8_to_v5i64:
-; CHECK: ScratchSize: 0
+
+
define amdgpu_kernel void @bitcast_v40i8_to_v5i64(i32 %cond, ptr addrspace(1) %out, <40 x i8> %value) {
+; GCN-LABEL: bitcast_v40i8_to_v5i64:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_mov_b32 s0, 0
+; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s15, 0xf000
+; GCN-NEXT: s_mov_b32 s14, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s0
+; GCN-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s0
+; GCN-NEXT: v_mov_b32_e32 v8, s0
+; GCN-NEXT: v_mov_b32_e32 v9, s0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
+; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v40i8_to_v5i64:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s0, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_add_u32 s8, s4, 16
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_addc_u32 s9, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s9
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: s_add_u32 s0, s4, 32
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v40i8_to_v5i64:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_lg_u32 s0, 0
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v40i8_to_v5i64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
+; GFX11-NEXT: v_mov_b32_e32 v8, s0
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5]
+; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -516,9 +4748,109 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v5f64_to_v10f32:
-; CHECK: ScratchSize: 0
+
+
define amdgpu_kernel void @bitcast_v5f64_to_v10f32(i32 %cond, ptr addrspace(1) %out, <5 x double> %value) {
+; GCN-LABEL: bitcast_v5f64_to_v10f32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s1, s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s15, 0xf000
+; GCN-NEXT: s_mov_b32 s14, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s0
+; GCN-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s0
+; GCN-NEXT: v_mov_b32_e32 v8, s0
+; GCN-NEXT: v_mov_b32_e32 v9, s0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
+; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v5f64_to_v10f32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s7, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_mov_b32 s1, 0
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s7, 0
+; VI-NEXT: s_add_u32 s8, s4, 16
+; VI-NEXT: s_addc_u32 s9, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v5, s9
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_add_u32 s0, s4, 32
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v5f64_to_v10f32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s7, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
+; GFX9-NEXT: s_cmp_lg_u32 s7, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v5f64_to_v10f32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s7, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
+; GFX11-NEXT: v_mov_b32_e32 v8, s0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5]
+; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32
+; GFX11-NEXT: s_cmp_lg_u32 s7, 0
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -535,9 +4867,109 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v5f64_to_v10i32:
-; CHECK: ScratchSize: 0
+
+
define amdgpu_kernel void @bitcast_v5f64_to_v10i32(i32 %cond, ptr addrspace(1) %out, <5 x double> %value) {
+; GCN-LABEL: bitcast_v5f64_to_v10i32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s1, s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s15, 0xf000
+; GCN-NEXT: s_mov_b32 s14, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s0
+; GCN-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s0
+; GCN-NEXT: v_mov_b32_e32 v8, s0
+; GCN-NEXT: v_mov_b32_e32 v9, s0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
+; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v5f64_to_v10i32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s7, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_mov_b32 s1, 0
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s7, 0
+; VI-NEXT: s_add_u32 s8, s4, 16
+; VI-NEXT: s_addc_u32 s9, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v5, s9
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_add_u32 s0, s4, 32
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v5f64_to_v10i32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s7, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
+; GFX9-NEXT: s_cmp_lg_u32 s7, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v5f64_to_v10i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s7, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
+; GFX11-NEXT: v_mov_b32_e32 v8, s0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5]
+; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32
+; GFX11-NEXT: s_cmp_lg_u32 s7, 0
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -554,9 +4986,109 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v5i64_to_v10f32:
-; CHECK: ScratchSize: 0
+
+
define amdgpu_kernel void @bitcast_v5i64_to_v10f32(i32 %cond, ptr addrspace(1) %out, <5 x i64> %value) {
+; GCN-LABEL: bitcast_v5i64_to_v10f32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s1, s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s15, 0xf000
+; GCN-NEXT: s_mov_b32 s14, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s0
+; GCN-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s0
+; GCN-NEXT: v_mov_b32_e32 v8, s0
+; GCN-NEXT: v_mov_b32_e32 v9, s0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
+; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v5i64_to_v10f32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s7, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_mov_b32 s1, 0
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s7, 0
+; VI-NEXT: s_add_u32 s8, s4, 16
+; VI-NEXT: s_addc_u32 s9, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v5, s9
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_add_u32 s0, s4, 32
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v5i64_to_v10f32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s7, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
+; GFX9-NEXT: s_cmp_lg_u32 s7, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v5i64_to_v10f32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s7, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
+; GFX11-NEXT: v_mov_b32_e32 v8, s0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5]
+; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32
+; GFX11-NEXT: s_cmp_lg_u32 s7, 0
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -573,9 +5105,109 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v5i64_to_v10i32:
-; CHECK: ScratchSize: 0
+
+
define amdgpu_kernel void @bitcast_v5i64_to_v10i32(i32 %cond, ptr addrspace(1) %out, <5 x i64> %value) {
+; GCN-LABEL: bitcast_v5i64_to_v10i32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s1, s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s15, 0xf000
+; GCN-NEXT: s_mov_b32 s14, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s0
+; GCN-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s0
+; GCN-NEXT: v_mov_b32_e32 v8, s0
+; GCN-NEXT: v_mov_b32_e32 v9, s0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
+; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v5i64_to_v10i32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s7, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_mov_b32 s1, 0
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s7, 0
+; VI-NEXT: s_add_u32 s8, s4, 16
+; VI-NEXT: s_addc_u32 s9, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v5, s9
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_add_u32 s0, s4, 32
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v5i64_to_v10i32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s7, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
+; GFX9-NEXT: s_cmp_lg_u32 s7, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v5i64_to_v10i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s7, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
+; GFX11-NEXT: v_mov_b32_e32 v8, s0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5]
+; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32
+; GFX11-NEXT: s_cmp_lg_u32 s7, 0
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -592,9 +5224,115 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v6f64_to_v12i32:
-; CHECK: ScratchSize: 0
+
+
define amdgpu_kernel void @bitcast_v6f64_to_v12i32(i32 %cond, ptr addrspace(1) %out, <6 x double> %value) {
+; GCN-LABEL: bitcast_v6f64_to_v12i32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s1, s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s19, 0xf000
+; GCN-NEXT: s_mov_b32 s18, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s0
+; GCN-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s0
+; GCN-NEXT: v_mov_b32_e32 v8, s0
+; GCN-NEXT: v_mov_b32_e32 v9, s0
+; GCN-NEXT: v_mov_b32_e32 v10, s0
+; GCN-NEXT: v_mov_b32_e32 v11, s0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0
+; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v6f64_to_v12i32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s9, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_mov_b32 s3, 0
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s9, 0
+; VI-NEXT: s_add_u32 s10, s4, 16
+; VI-NEXT: s_addc_u32 s11, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s10
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v5, s11
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: s_add_u32 s0, s4, 32
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v6f64_to_v12i32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x2c
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:16
+; GFX9-NEXT: s_cmp_lg_u32 s9, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15]
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:32
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v6f64_to_v12i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
+; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
+; GFX11-NEXT: v_mov_b32_e32 v10, s0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: global_store_b128 v12, v[0:3], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v12, v[4:7], s[4:5]
+; GFX11-NEXT: global_store_b128 v12, v[8:11], s[4:5] offset:32
+; GFX11-NEXT: s_cmp_lg_u32 s9, 0
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -611,9 +5349,115 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v6f64_to_v12f32:
-; CHECK: ScratchSize: 0
+
+
define amdgpu_kernel void @bitcast_v6f64_to_v12f32(i32 %cond, ptr addrspace(1) %out, <6 x double> %value) {
+; GCN-LABEL: bitcast_v6f64_to_v12f32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s1, s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s19, 0xf000
+; GCN-NEXT: s_mov_b32 s18, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s0
+; GCN-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s0
+; GCN-NEXT: v_mov_b32_e32 v8, s0
+; GCN-NEXT: v_mov_b32_e32 v9, s0
+; GCN-NEXT: v_mov_b32_e32 v10, s0
+; GCN-NEXT: v_mov_b32_e32 v11, s0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0
+; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v6f64_to_v12f32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s9, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_mov_b32 s3, 0
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s9, 0
+; VI-NEXT: s_add_u32 s10, s4, 16
+; VI-NEXT: s_addc_u32 s11, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s10
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v5, s11
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: s_add_u32 s0, s4, 32
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v6f64_to_v12f32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x2c
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:16
+; GFX9-NEXT: s_cmp_lg_u32 s9, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15]
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:32
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v6f64_to_v12f32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
+; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
+; GFX11-NEXT: v_mov_b32_e32 v10, s0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: global_store_b128 v12, v[0:3], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v12, v[4:7], s[4:5]
+; GFX11-NEXT: global_store_b128 v12, v[8:11], s[4:5] offset:32
+; GFX11-NEXT: s_cmp_lg_u32 s9, 0
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -630,9 +5474,117 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v12i32_to_v6i64:
-; CHECK: ScratchSize: 0
+
+
define amdgpu_kernel void @bitcast_v12i32_to_v6i64(i32 %cond, ptr addrspace(1) %out, <12 x i32> %value) {
+; GCN-LABEL: bitcast_v12i32_to_v6i64:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_mov_b32 s0, 0
+; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s19, 0xf000
+; GCN-NEXT: s_mov_b32 s18, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s0
+; GCN-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s0
+; GCN-NEXT: v_mov_b32_e32 v8, s0
+; GCN-NEXT: v_mov_b32_e32 v9, s0
+; GCN-NEXT: v_mov_b32_e32 v10, s0
+; GCN-NEXT: v_mov_b32_e32 v11, s0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0
+; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v12i32_to_v6i64:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s0, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_add_u32 s10, s4, 16
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_addc_u32 s11, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s10
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s11
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: s_add_u32 s0, s4, 32
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v12i32_to_v6i64:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_lg_u32 s0, 0
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:16
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15]
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:32
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v12i32_to_v6i64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
+; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
+; GFX11-NEXT: v_mov_b32_e32 v10, s0
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: global_store_b128 v12, v[0:3], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v12, v[4:7], s[4:5]
+; GFX11-NEXT: global_store_b128 v12, v[8:11], s[4:5] offset:32
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -649,9 +5601,117 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v12i32_to_v6f64:
-; CHECK: ScratchSize: 0
+
+
define amdgpu_kernel void @bitcast_v12i32_to_v6f64(i32 %cond, ptr addrspace(1) %out, <12 x i32> %value) {
+; GCN-LABEL: bitcast_v12i32_to_v6f64:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_mov_b32 s0, 0
+; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s19, 0xf000
+; GCN-NEXT: s_mov_b32 s18, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s0
+; GCN-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s0
+; GCN-NEXT: v_mov_b32_e32 v8, s0
+; GCN-NEXT: v_mov_b32_e32 v9, s0
+; GCN-NEXT: v_mov_b32_e32 v10, s0
+; GCN-NEXT: v_mov_b32_e32 v11, s0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0
+; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v12i32_to_v6f64:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s0, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_add_u32 s10, s4, 16
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_addc_u32 s11, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s10
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s11
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: s_add_u32 s0, s4, 32
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v12i32_to_v6f64:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_lg_u32 s0, 0
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:16
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15]
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:32
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v12i32_to_v6f64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
+; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
+; GFX11-NEXT: v_mov_b32_e32 v10, s0
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: global_store_b128 v12, v[0:3], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v12, v[4:7], s[4:5]
+; GFX11-NEXT: global_store_b128 v12, v[8:11], s[4:5] offset:32
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -668,9 +5728,115 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v6i64_to_v12i32:
-; CHECK: ScratchSize: 0
+
+
define amdgpu_kernel void @bitcast_v6i64_to_v12i32(i32 %cond, ptr addrspace(1) %out, <6 x i64> %value) {
+; GCN-LABEL: bitcast_v6i64_to_v12i32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s1, s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s19, 0xf000
+; GCN-NEXT: s_mov_b32 s18, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s0
+; GCN-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s0
+; GCN-NEXT: v_mov_b32_e32 v8, s0
+; GCN-NEXT: v_mov_b32_e32 v9, s0
+; GCN-NEXT: v_mov_b32_e32 v10, s0
+; GCN-NEXT: v_mov_b32_e32 v11, s0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0
+; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v6i64_to_v12i32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s9, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_mov_b32 s3, 0
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s9, 0
+; VI-NEXT: s_add_u32 s10, s4, 16
+; VI-NEXT: s_addc_u32 s11, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s10
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v5, s11
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: s_add_u32 s0, s4, 32
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v6i64_to_v12i32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x2c
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:16
+; GFX9-NEXT: s_cmp_lg_u32 s9, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15]
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:32
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v6i64_to_v12i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
+; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
+; GFX11-NEXT: v_mov_b32_e32 v10, s0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: global_store_b128 v12, v[0:3], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v12, v[4:7], s[4:5]
+; GFX11-NEXT: global_store_b128 v12, v[8:11], s[4:5] offset:32
+; GFX11-NEXT: s_cmp_lg_u32 s9, 0
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -687,9 +5853,131 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v7i64_to_v14i32:
-; CHECK: ScratchSize: 0
+
+
define amdgpu_kernel void @bitcast_v7i64_to_v14i32(i32 %cond, ptr addrspace(1) %out, <7 x i64> %value) {
+; GCN-LABEL: bitcast_v7i64_to_v14i32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_mov_b32 s0, 0
+; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s19, 0xf000
+; GCN-NEXT: s_mov_b32 s18, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s0
+; GCN-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s0
+; GCN-NEXT: v_mov_b32_e32 v12, s0
+; GCN-NEXT: v_mov_b32_e32 v13, s0
+; GCN-NEXT: v_mov_b32_e32 v8, s0
+; GCN-NEXT: v_mov_b32_e32 v9, s0
+; GCN-NEXT: v_mov_b32_e32 v10, s0
+; GCN-NEXT: v_mov_b32_e32 v11, s0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0
+; GCN-NEXT: buffer_store_dwordx2 v[12:13], off, s[16:19], 0 offset:48
+; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v7i64_to_v14i32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s0, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_add_u32 s12, s4, 16
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_addc_u32 s13, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s12
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s13
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_add_u32 s8, s4, 48
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_addc_u32 s9, s5, 0
+; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v3, s9
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: s_add_u32 s0, s4, 32
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v7i64_to_v14i32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_lg_u32 s0, 0
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:48
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:32
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v7i64_to_v14i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v14, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v13, s0
+; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v9, s0
+; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
+; GFX11-NEXT: v_mov_b32_e32 v10, s0
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_store_b128 v14, v[0:3], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v14, v[4:7], s[4:5]
+; GFX11-NEXT: global_store_b64 v14, v[12:13], s[4:5] offset:48
+; GFX11-NEXT: global_store_b128 v14, v[8:11], s[4:5] offset:32
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -706,9 +5994,131 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v7f64_to_v14i32:
-; CHECK: ScratchSize: 0
+
+
define amdgpu_kernel void @bitcast_v7f64_to_v14i32(i32 %cond, ptr addrspace(1) %out, <7 x double> %value) {
+; GCN-LABEL: bitcast_v7f64_to_v14i32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_mov_b32 s0, 0
+; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s19, 0xf000
+; GCN-NEXT: s_mov_b32 s18, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s0
+; GCN-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NEXT: v_mov_b32_e32 v7, s0
+; GCN-NEXT: v_mov_b32_e32 v12, s0
+; GCN-NEXT: v_mov_b32_e32 v13, s0
+; GCN-NEXT: v_mov_b32_e32 v8, s0
+; GCN-NEXT: v_mov_b32_e32 v9, s0
+; GCN-NEXT: v_mov_b32_e32 v10, s0
+; GCN-NEXT: v_mov_b32_e32 v11, s0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0
+; GCN-NEXT: buffer_store_dwordx2 v[12:13], off, s[16:19], 0 offset:48
+; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v7f64_to_v14i32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s0, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_add_u32 s12, s4, 16
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_addc_u32 s13, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s12
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s13
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_add_u32 s8, s4, 48
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_addc_u32 s9, s5, 0
+; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v3, s9
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: s_add_u32 s0, s4, 32
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v7f64_to_v14i32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_lg_u32 s0, 0
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:48
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:32
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v7f64_to_v14i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v14, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v13, s0
+; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v9, s0
+; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
+; GFX11-NEXT: v_mov_b32_e32 v10, s0
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_store_b128 v14, v[0:3], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v14, v[4:7], s[4:5]
+; GFX11-NEXT: global_store_b64 v14, v[12:13], s[4:5] offset:48
+; GFX11-NEXT: global_store_b128 v14, v[8:11], s[4:5] offset:32
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -725,9 +6135,156 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v9i64_to_v18i32:
-; CHECK: ScratchSize: 0
+
+
define amdgpu_kernel void @bitcast_v9i64_to_v18i32(i32 %cond, ptr addrspace(1) %out, <9 x i64> %value) {
+; GCN-LABEL: bitcast_v9i64_to_v18i32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NEXT: v_mov_b32_e32 v5, s6
+; GCN-NEXT: v_mov_b32_e32 v6, s6
+; GCN-NEXT: v_mov_b32_e32 v7, s6
+; GCN-NEXT: v_mov_b32_e32 v8, s6
+; GCN-NEXT: v_mov_b32_e32 v9, s6
+; GCN-NEXT: v_mov_b32_e32 v10, s6
+; GCN-NEXT: v_mov_b32_e32 v11, s6
+; GCN-NEXT: v_mov_b32_e32 v12, s6
+; GCN-NEXT: v_mov_b32_e32 v13, s6
+; GCN-NEXT: v_mov_b32_e32 v14, s6
+; GCN-NEXT: v_mov_b32_e32 v15, s6
+; GCN-NEXT: v_mov_b32_e32 v16, s6
+; GCN-NEXT: v_mov_b32_e32 v17, s6
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[16:17], off, s[0:3], 0 offset:64
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v9i64_to_v18i32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s0, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_add_u32 s16, s4, 48
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_addc_u32 s17, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s16
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s17
+; VI-NEXT: s_add_u32 s12, s4, 32
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_addc_u32 s13, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s12
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s13
+; VI-NEXT: s_add_u32 s10, s4, 16
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_addc_u32 s11, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s10
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s11
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: s_add_u32 s0, s4, 64
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v9i64_to_v18i32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_lg_u32 s0, 0
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[20:21] offset:48
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[20:21] offset:32
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[20:21] offset:16
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[20:21]
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[20:21] offset:64
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v9i64_to_v18i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v18, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
+; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
+; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0
+; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0
+; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v17, s0
+; GFX11-NEXT: v_mov_b32_e32 v16, s0
+; GFX11-NEXT: s_clause 0x4
+; GFX11-NEXT: global_store_b128 v18, v[0:3], s[4:5] offset:48
+; GFX11-NEXT: global_store_b128 v18, v[4:7], s[4:5] offset:32
+; GFX11-NEXT: global_store_b128 v18, v[8:11], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v18, v[12:15], s[4:5]
+; GFX11-NEXT: global_store_b64 v18, v[16:17], s[4:5] offset:64
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -744,9 +6301,163 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v10i64_to_v20i32:
-; CHECK: ScratchSize: 0
+
+
define amdgpu_kernel void @bitcast_v10i64_to_v20i32(i32 %cond, ptr addrspace(1) %out, <10 x i64> %value) {
+; GCN-LABEL: bitcast_v10i64_to_v20i32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NEXT: v_mov_b32_e32 v5, s6
+; GCN-NEXT: v_mov_b32_e32 v6, s6
+; GCN-NEXT: v_mov_b32_e32 v7, s6
+; GCN-NEXT: v_mov_b32_e32 v8, s6
+; GCN-NEXT: v_mov_b32_e32 v9, s6
+; GCN-NEXT: v_mov_b32_e32 v10, s6
+; GCN-NEXT: v_mov_b32_e32 v11, s6
+; GCN-NEXT: v_mov_b32_e32 v12, s6
+; GCN-NEXT: v_mov_b32_e32 v13, s6
+; GCN-NEXT: v_mov_b32_e32 v14, s6
+; GCN-NEXT: v_mov_b32_e32 v15, s6
+; GCN-NEXT: v_mov_b32_e32 v16, s6
+; GCN-NEXT: v_mov_b32_e32 v17, s6
+; GCN-NEXT: v_mov_b32_e32 v18, s6
+; GCN-NEXT: v_mov_b32_e32 v19, s6
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v10i64_to_v20i32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s0, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_add_u32 s18, s4, 48
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_addc_u32 s19, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s18
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s19
+; VI-NEXT: s_add_u32 s14, s4, 32
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_addc_u32 s15, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s14
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s15
+; VI-NEXT: s_add_u32 s14, s4, 16
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_addc_u32 s15, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s14
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s15
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: s_add_u32 s0, s4, 64
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v10i64_to_v20i32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_lg_u32 s0, 0
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[22:23] offset:48
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[22:23] offset:32
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[22:23] offset:16
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[22:23]
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[22:23] offset:64
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v10i64_to_v20i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v20, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
+; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
+; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0
+; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0
+; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v17, s0
+; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v19, s0
+; GFX11-NEXT: v_mov_b32_e32 v18, s0
+; GFX11-NEXT: s_clause 0x4
+; GFX11-NEXT: global_store_b128 v20, v[0:3], s[4:5] offset:48
+; GFX11-NEXT: global_store_b128 v20, v[4:7], s[4:5] offset:32
+; GFX11-NEXT: global_store_b128 v20, v[8:11], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v20, v[12:15], s[4:5]
+; GFX11-NEXT: global_store_b128 v20, v[16:19], s[4:5] offset:64
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -763,9 +6474,183 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v11i64_to_v20i32:
-; CHECK: ScratchSize: 0
+
+
define amdgpu_kernel void @bitcast_v11i64_to_v20i32(i32 %cond, ptr addrspace(1) %out, <11 x i64> %value) {
+; GCN-LABEL: bitcast_v11i64_to_v20i32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NEXT: v_mov_b32_e32 v5, s6
+; GCN-NEXT: v_mov_b32_e32 v6, s6
+; GCN-NEXT: v_mov_b32_e32 v7, s6
+; GCN-NEXT: v_mov_b32_e32 v8, s6
+; GCN-NEXT: v_mov_b32_e32 v9, s6
+; GCN-NEXT: v_mov_b32_e32 v10, s6
+; GCN-NEXT: v_mov_b32_e32 v11, s6
+; GCN-NEXT: v_mov_b32_e32 v12, s6
+; GCN-NEXT: v_mov_b32_e32 v13, s6
+; GCN-NEXT: v_mov_b32_e32 v14, s6
+; GCN-NEXT: v_mov_b32_e32 v15, s6
+; GCN-NEXT: v_mov_b32_e32 v16, s6
+; GCN-NEXT: v_mov_b32_e32 v17, s6
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[16:17], off, s[0:3], 0 offset:80
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v11i64_to_v20i32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s0, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_add_u32 s20, s4, 48
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_addc_u32 s21, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s20
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s21
+; VI-NEXT: s_add_u32 s16, s4, 32
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_addc_u32 s17, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s16
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s17
+; VI-NEXT: s_add_u32 s10, s4, 16
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_addc_u32 s11, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s10
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s11
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_mov_b32 s6, s0
+; VI-NEXT: s_mov_b32 s7, s0
+; VI-NEXT: s_mov_b32 s8, s0
+; VI-NEXT: s_mov_b32 s9, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: s_add_u32 s0, s4, 0x50
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_add_u32 s0, s4, 64
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: v_mov_b32_e32 v3, s9
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v11i64_to_v20i32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_lg_u32 s0, 0
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[24:25] offset:48
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[24:25] offset:32
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[24:25] offset:16
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[24:25]
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[24:25] offset:80
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[24:25] offset:64
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v11i64_to_v20i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v22, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
+; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
+; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0
+; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0
+; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v21, s0
+; GFX11-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v17, s0
+; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v19, s0
+; GFX11-NEXT: v_mov_b32_e32 v18, s0
+; GFX11-NEXT: s_clause 0x5
+; GFX11-NEXT: global_store_b128 v22, v[0:3], s[4:5] offset:48
+; GFX11-NEXT: global_store_b128 v22, v[4:7], s[4:5] offset:32
+; GFX11-NEXT: global_store_b128 v22, v[8:11], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v22, v[12:15], s[4:5]
+; GFX11-NEXT: global_store_b64 v22, v[20:21], s[4:5] offset:80
+; GFX11-NEXT: global_store_b128 v22, v[16:19], s[4:5] offset:64
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -782,9 +6667,185 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v12i64_to_v22i32:
-; CHECK: ScratchSize: 0
+
+
define amdgpu_kernel void @bitcast_v12i64_to_v22i32(i32 %cond, ptr addrspace(1) %out, <12 x i64> %value) {
+; GCN-LABEL: bitcast_v12i64_to_v22i32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NEXT: v_mov_b32_e32 v5, s6
+; GCN-NEXT: v_mov_b32_e32 v6, s6
+; GCN-NEXT: v_mov_b32_e32 v7, s6
+; GCN-NEXT: v_mov_b32_e32 v8, s6
+; GCN-NEXT: v_mov_b32_e32 v9, s6
+; GCN-NEXT: v_mov_b32_e32 v10, s6
+; GCN-NEXT: v_mov_b32_e32 v11, s6
+; GCN-NEXT: v_mov_b32_e32 v12, s6
+; GCN-NEXT: v_mov_b32_e32 v13, s6
+; GCN-NEXT: v_mov_b32_e32 v14, s6
+; GCN-NEXT: v_mov_b32_e32 v15, s6
+; GCN-NEXT: v_mov_b32_e32 v16, s6
+; GCN-NEXT: v_mov_b32_e32 v17, s6
+; GCN-NEXT: v_mov_b32_e32 v18, s6
+; GCN-NEXT: v_mov_b32_e32 v19, s6
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
+; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
+; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v12i64_to_v22i32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s0, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_add_u32 s22, s4, 0x50
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_addc_u32 s23, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s22
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s23
+; VI-NEXT: s_add_u32 s18, s4, 64
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_addc_u32 s19, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s18
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s19
+; VI-NEXT: s_add_u32 s14, s4, 48
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_addc_u32 s15, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s14
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s15
+; VI-NEXT: s_add_u32 s10, s4, 32
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_addc_u32 s11, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s10
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s11
+; VI-NEXT: s_add_u32 s6, s4, 16
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_addc_u32 s7, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s6
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s7
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v12i64_to_v22i32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[26:27], s[4:5], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_lg_u32 s0, 0
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[26:27] offset:80
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[26:27] offset:64
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[26:27] offset:48
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[26:27] offset:32
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[26:27] offset:16
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[26:27]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v12i64_to_v22i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
+; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
+; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0
+; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0
+; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v17, s0
+; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v19, s0
+; GFX11-NEXT: v_dual_mov_b32 v18, s0 :: v_dual_mov_b32 v21, s0
+; GFX11-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v23, s0
+; GFX11-NEXT: v_mov_b32_e32 v22, s0
+; GFX11-NEXT: s_clause 0x5
+; GFX11-NEXT: global_store_b128 v24, v[0:3], s[4:5] offset:80
+; GFX11-NEXT: global_store_b128 v24, v[4:7], s[4:5] offset:64
+; GFX11-NEXT: global_store_b128 v24, v[8:11], s[4:5] offset:48
+; GFX11-NEXT: global_store_b128 v24, v[12:15], s[4:5] offset:32
+; GFX11-NEXT: global_store_b128 v24, v[16:19], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v24, v[20:23], s[4:5]
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -801,9 +6862,204 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v13i64_to_v24i32:
-; CHECK: ScratchSize: 0
+
+
define amdgpu_kernel void @bitcast_v13i64_to_v24i32(i32 %cond, ptr addrspace(1) %out, <13 x i64> %value) {
+; GCN-LABEL: bitcast_v13i64_to_v24i32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NEXT: v_mov_b32_e32 v5, s6
+; GCN-NEXT: v_mov_b32_e32 v6, s6
+; GCN-NEXT: v_mov_b32_e32 v7, s6
+; GCN-NEXT: v_mov_b32_e32 v8, s6
+; GCN-NEXT: v_mov_b32_e32 v9, s6
+; GCN-NEXT: v_mov_b32_e32 v10, s6
+; GCN-NEXT: v_mov_b32_e32 v11, s6
+; GCN-NEXT: v_mov_b32_e32 v12, s6
+; GCN-NEXT: v_mov_b32_e32 v13, s6
+; GCN-NEXT: v_mov_b32_e32 v14, s6
+; GCN-NEXT: v_mov_b32_e32 v15, s6
+; GCN-NEXT: v_mov_b32_e32 v16, s6
+; GCN-NEXT: v_mov_b32_e32 v17, s6
+; GCN-NEXT: v_mov_b32_e32 v18, s6
+; GCN-NEXT: v_mov_b32_e32 v19, s6
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NEXT: v_mov_b32_e32 v5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
+; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:96
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v13i64_to_v24i32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s0, s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_add_u32 s24, s4, 0x50
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_addc_u32 s25, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s24
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s25
+; VI-NEXT: s_add_u32 s20, s4, 64
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_addc_u32 s21, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s20
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s21
+; VI-NEXT: s_add_u32 s16, s4, 48
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_addc_u32 s17, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s16
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s17
+; VI-NEXT: s_add_u32 s12, s4, 32
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_addc_u32 s13, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s12
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s13
+; VI-NEXT: s_add_u32 s6, s4, 16
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_addc_u32 s7, s5, 0
+; VI-NEXT: v_mov_b32_e32 v4, s6
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s7
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: s_add_u32 s0, s4, 0x60
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v13i64_to_v24i32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[28:29], s[4:5], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_lg_u32 s0, 0
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[28:29] offset:80
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[28:29] offset:64
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[28:29] offset:48
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[28:29] offset:32
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[28:29] offset:16
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[28:29]
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[28:29] offset:96
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v13i64_to_v24i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v20, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
+; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
+; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0
+; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0
+; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v17, s0
+; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v19, s0
+; GFX11-NEXT: v_mov_b32_e32 v18, s0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v20, v[0:3], s[4:5] offset:80
+; GFX11-NEXT: global_store_b128 v20, v[4:7], s[4:5] offset:64
+; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_dual_mov_b32 v3, s0 :: v_dual_mov_b32 v4, s0
+; GFX11-NEXT: v_mov_b32_e32 v5, s0
+; GFX11-NEXT: s_clause 0x4
+; GFX11-NEXT: global_store_b128 v20, v[8:11], s[4:5] offset:48
+; GFX11-NEXT: global_store_b128 v20, v[12:15], s[4:5] offset:32
+; GFX11-NEXT: global_store_b128 v20, v[16:19], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v20, v[0:3], s[4:5]
+; GFX11-NEXT: global_store_b64 v20, v[4:5], s[4:5] offset:96
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -820,9 +7076,211 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v14i64_to_v26i32:
-; CHECK: ScratchSize: 0
+
+
define amdgpu_kernel void @bitcast_v14i64_to_v26i32(i32 %cond, ptr addrspace(1) %out, <14 x i64> %value) {
+; GCN-LABEL: bitcast_v14i64_to_v26i32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NEXT: v_mov_b32_e32 v5, s6
+; GCN-NEXT: v_mov_b32_e32 v6, s6
+; GCN-NEXT: v_mov_b32_e32 v7, s6
+; GCN-NEXT: v_mov_b32_e32 v8, s6
+; GCN-NEXT: v_mov_b32_e32 v9, s6
+; GCN-NEXT: v_mov_b32_e32 v10, s6
+; GCN-NEXT: v_mov_b32_e32 v11, s6
+; GCN-NEXT: v_mov_b32_e32 v12, s6
+; GCN-NEXT: v_mov_b32_e32 v13, s6
+; GCN-NEXT: v_mov_b32_e32 v14, s6
+; GCN-NEXT: v_mov_b32_e32 v15, s6
+; GCN-NEXT: v_mov_b32_e32 v16, s6
+; GCN-NEXT: v_mov_b32_e32 v17, s6
+; GCN-NEXT: v_mov_b32_e32 v18, s6
+; GCN-NEXT: v_mov_b32_e32 v19, s6
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NEXT: v_mov_b32_e32 v5, s6
+; GCN-NEXT: v_mov_b32_e32 v6, s6
+; GCN-NEXT: v_mov_b32_e32 v7, s6
+; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
+; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v14i64_to_v26i32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s0, s[4:5], 0x24
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; VI-NEXT: v_mov_b32_e32 v3, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_u32 s4, s0, 0x50
+; VI-NEXT: s_addc_u32 s5, s1, 0
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_add_u32 s4, s0, 64
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_addc_u32 s5, s1, 0
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s2
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_add_u32 s4, s0, 48
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_addc_u32 s5, s1, 0
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s2
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_add_u32 s4, s0, 32
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_addc_u32 s5, s1, 0
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s2
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_add_u32 s4, s0, 16
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_addc_u32 s5, s1, 0
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s2
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_add_u32 s0, s0, 0x60
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s2
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s2
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v14i64_to_v26i32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[30:31], s[4:5], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_lg_u32 s0, 0
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] offset:80
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] offset:64
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] offset:48
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] offset:32
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] offset:16
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31]
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] offset:96
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v14i64_to_v26i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v20, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
+; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
+; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0
+; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0
+; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v17, s0
+; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v19, s0
+; GFX11-NEXT: v_mov_b32_e32 v18, s0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v20, v[0:3], s[4:5] offset:80
+; GFX11-NEXT: global_store_b128 v20, v[4:7], s[4:5] offset:64
+; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_dual_mov_b32 v3, s0 :: v_dual_mov_b32 v4, s0
+; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s0
+; GFX11-NEXT: v_mov_b32_e32 v7, s0
+; GFX11-NEXT: s_clause 0x4
+; GFX11-NEXT: global_store_b128 v20, v[8:11], s[4:5] offset:48
+; GFX11-NEXT: global_store_b128 v20, v[12:15], s[4:5] offset:32
+; GFX11-NEXT: global_store_b128 v20, v[16:19], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v20, v[0:3], s[4:5]
+; GFX11-NEXT: global_store_b128 v20, v[4:7], s[4:5] offset:96
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -839,9 +7297,231 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}bitcast_v15i64_to_v26i32:
-; CHECK: ScratchSize: 0
+
+
define amdgpu_kernel void @bitcast_v15i64_to_v26i32(i32 %cond, ptr addrspace(1) %out, <15 x i64> %value) {
+; GCN-LABEL: bitcast_v15i64_to_v26i32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NEXT: v_mov_b32_e32 v5, s6
+; GCN-NEXT: v_mov_b32_e32 v6, s6
+; GCN-NEXT: v_mov_b32_e32 v7, s6
+; GCN-NEXT: v_mov_b32_e32 v8, s6
+; GCN-NEXT: v_mov_b32_e32 v9, s6
+; GCN-NEXT: v_mov_b32_e32 v10, s6
+; GCN-NEXT: v_mov_b32_e32 v11, s6
+; GCN-NEXT: v_mov_b32_e32 v12, s6
+; GCN-NEXT: v_mov_b32_e32 v13, s6
+; GCN-NEXT: v_mov_b32_e32 v14, s6
+; GCN-NEXT: v_mov_b32_e32 v15, s6
+; GCN-NEXT: v_mov_b32_e32 v16, s6
+; GCN-NEXT: v_mov_b32_e32 v17, s6
+; GCN-NEXT: v_mov_b32_e32 v18, s6
+; GCN-NEXT: v_mov_b32_e32 v19, s6
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
+; GCN-NEXT: v_mov_b32_e32 v20, s6
+; GCN-NEXT: v_mov_b32_e32 v21, s6
+; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NEXT: v_mov_b32_e32 v5, s6
+; GCN-NEXT: v_mov_b32_e32 v6, s6
+; GCN-NEXT: v_mov_b32_e32 v7, s6
+; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[20:21], off, s[0:3], 0 offset:112
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: bitcast_v15i64_to_v26i32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s0, s[4:5], 0x24
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; VI-NEXT: v_mov_b32_e32 v3, s2
+; VI-NEXT: s_mov_b32 s12, s2
+; VI-NEXT: s_mov_b32 s13, s2
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_u32 s4, s0, 0x50
+; VI-NEXT: s_addc_u32 s5, s1, 0
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_add_u32 s4, s0, 64
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_addc_u32 s5, s1, 0
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s2
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_add_u32 s4, s0, 48
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_addc_u32 s5, s1, 0
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s2
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_add_u32 s4, s0, 32
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_addc_u32 s5, s1, 0
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s2
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_add_u32 s4, s0, 16
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_addc_u32 s5, s1, 0
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s2
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s2
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_mov_b32 s15, s2
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: s_add_u32 s2, s0, 0x70
+; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: s_add_u32 s0, s0, 0x60
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s12
+; VI-NEXT: v_mov_b32_e32 v1, s13
+; VI-NEXT: v_mov_b32_e32 v2, s14
+; VI-NEXT: v_mov_b32_e32 v3, s15
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: bitcast_v15i64_to_v26i32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_lg_u32 s0, 0
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[34:35] offset:112
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitcast_v15i64_to_v26i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v22, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
+; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
+; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0
+; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0
+; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v17, s0
+; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v19, s0
+; GFX11-NEXT: v_mov_b32_e32 v18, s0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v22, v[0:3], s[4:5] offset:80
+; GFX11-NEXT: global_store_b128 v22, v[4:7], s[4:5] offset:64
+; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_dual_mov_b32 v3, s0 :: v_dual_mov_b32 v20, s0
+; GFX11-NEXT: v_dual_mov_b32 v21, s0 :: v_dual_mov_b32 v4, s0
+; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s0
+; GFX11-NEXT: v_mov_b32_e32 v7, s0
+; GFX11-NEXT: s_clause 0x5
+; GFX11-NEXT: global_store_b128 v22, v[8:11], s[4:5] offset:48
+; GFX11-NEXT: global_store_b128 v22, v[12:15], s[4:5] offset:32
+; GFX11-NEXT: global_store_b128 v22, v[16:19], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v22, v[0:3], s[4:5]
+; GFX11-NEXT: global_store_b64 v22, v[20:21], s[4:5] offset:112
+; GFX11-NEXT: global_store_b128 v22, v[4:7], s[4:5] offset:96
+; GFX11-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -858,8 +7538,70 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v2bf16_to_i32:
+
define void @v_bitcast_v2bf16_to_i32(i32 %cond, ptr addrspace(1) %out, <2 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v2bf16_to_i32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB59_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v5, v0, v3, 16
+; GCN-NEXT: .LBB59_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dword v5, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v2bf16_to_i32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dword v[1:2], v4
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v2bf16_to_i32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dword v[1:2], v4, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v2bf16_to_i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b32 v[1:2], v4, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -874,8 +7616,70 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v2bf16_to_v2i16:
+
define void @v_bitcast_v2bf16_to_v2i16(i32 %cond, ptr addrspace(1) %out, <2 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v2bf16_to_v2i16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB60_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v5, v0, v3, 16
+; GCN-NEXT: .LBB60_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dword v5, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v2bf16_to_v2i16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dword v[1:2], v4
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v2bf16_to_v2i16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dword v[1:2], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v2bf16_to_v2i16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b32 v[1:2], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -890,8 +7694,77 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v2bf16_to_v2f16:
+
define void @v_bitcast_v2bf16_to_v2f16(i32 %cond, ptr addrspace(1) %out, <2 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v2bf16_to_v2f16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB61_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v3
+; GCN-NEXT: .LBB61_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_or_b32_e32 v0, v0, v3
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v2bf16_to_v2f16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dword v[1:2], v4
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v2bf16_to_v2f16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dword v[1:2], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v2bf16_to_v2f16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b32 v[1:2], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -906,8 +7779,70 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v2bf16_to_v4i8:
+
define void @v_bitcast_v2bf16_to_v4i8(i32 %cond, ptr addrspace(1) %out, <2 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v2bf16_to_v4i8:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB62_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v5, v0, v3, 16
+; GCN-NEXT: .LBB62_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dword v5, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v2bf16_to_v4i8:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dword v[1:2], v4
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v2bf16_to_v4i8:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dword v[1:2], v4, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v2bf16_to_v4i8:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b32 v[1:2], v4, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -922,8 +7857,86 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v3bf16_to_v3i16:
+
define void @v_bitcast_v3bf16_to_v3i16(i32 %cond, ptr addrspace(1) %out, <3 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v3bf16_to_v3i16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, v6
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB63_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v6, v4, v3, 16
+; GCN-NEXT: .LBB63_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64 offset:4
+; GCN-NEXT: buffer_store_dword v6, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v3bf16_to_v3i16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v5, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_mov_b32_e32 v5, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 4, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_short v[3:4], v6
+; VI-NEXT: flat_store_dword v[1:2], v5
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v3bf16_to_v3i16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v6, v5
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_short v[1:2], v6, off offset:4
+; GFX9-NEXT: global_store_dword v[1:2], v5, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v3bf16_to_v3i16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v5, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b16 v[1:2], v6, off offset:4
+; GFX11-NEXT: global_store_b32 v[1:2], v5, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -938,8 +7951,96 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v3bf16_to_v3f16:
+
define void @v_bitcast_v3bf16_to_v3f16(i32 %cond, ptr addrspace(1) %out, <3 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v3bf16_to_v3f16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v7, 0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v6, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB64_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GCN-NEXT: .LBB64_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v6
+; GCN-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 offset:4
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_or_b32_e32 v0, v0, v3
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v3bf16_to_v3f16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v5, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_mov_b32_e32 v5, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 4, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_short v[3:4], v6
+; VI-NEXT: flat_store_dword v[1:2], v5
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v3bf16_to_v3f16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v6, v5
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_short v[1:2], v6, off offset:4
+; GFX9-NEXT: global_store_dword v[1:2], v5, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v3bf16_to_v3f16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v5, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b16 v[1:2], v6, off offset:4
+; GFX11-NEXT: global_store_b32 v[1:2], v5, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -954,8 +8055,73 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_i32_to_v2bf16:
+
define void @v_bitcast_i32_to_v2bf16(i32 %cond, ptr addrspace(1) %out, i32 %value) {
+; GCN-LABEL: v_bitcast_i32_to_v2bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v4, 0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB65_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; GCN-NEXT: .LBB65_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_alignbit_b32 v0, v3, v0, 16
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_i32_to_v2bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dword v[1:2], v4
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_i32_to_v2bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dword v[1:2], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_i32_to_v2bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b32 v[1:2], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -970,8 +8136,73 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v2i16_to_v2bf16:
+
define void @v_bitcast_v2i16_to_v2bf16(i32 %cond, ptr addrspace(1) %out, <2 x i16> %value) {
+; GCN-LABEL: v_bitcast_v2i16_to_v2bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB66_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GCN-NEXT: .LBB66_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_alignbit_b32 v0, v3, v0, 16
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v2i16_to_v2bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dword v[1:2], v4
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v2i16_to_v2bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dword v[1:2], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v2i16_to_v2bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b32 v[1:2], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -986,8 +8217,75 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v2f16_to_v2bf16:
+
define void @v_bitcast_v2f16_to_v2bf16(i32 %cond, ptr addrspace(1) %out, <2 x half> %value) {
+; GCN-LABEL: v_bitcast_v2f16_to_v2bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB67_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GCN-NEXT: .LBB67_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_alignbit_b32 v0, v3, v0, 16
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v2f16_to_v2bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dword v[1:2], v4
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v2f16_to_v2bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dword v[1:2], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v2f16_to_v2bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b32 v[1:2], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1002,8 +8300,99 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v4i8_to_v2bf16:
+
define void @v_bitcast_v4i8_to_v2bf16(i32 %cond, ptr addrspace(1) %out, <4 x i8> %value) {
+; GCN-LABEL: v_bitcast_v4i8_to_v2bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v7, 0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB68_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_and_b32_e32 v0, 0xff, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v4
+; GCN-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_or_b32_e32 v0, v3, v0
+; GCN-NEXT: v_or_b32_e32 v7, v5, v4
+; GCN-NEXT: .LBB68_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_alignbit_b32 v0, v3, v0, 16
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v4i8_to_v2bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4
+; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v6
+; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v7, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dword v[1:2], v7
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v4i8_to_v2bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4
+; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v6
+; GFX9-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dword v[1:2], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v4i8_to_v2bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-NEXT: s_cbranch_execz .LBB68_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v3
+; GFX11-NEXT: v_lshlrev_b16 v3, 8, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-NEXT: v_lshlrev_b16 v5, 8, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX11-NEXT: .LBB68_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b32 v[1:2], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1018,8 +8407,90 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v3i16_to_v3bf16:
+
define void @v_bitcast_v3i16_to_v3bf16(i32 %cond, ptr addrspace(1) %out, <3 x i16> %value) {
+; GCN-LABEL: v_bitcast_v3i16_to_v3bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v7, 0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v6, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB69_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; GCN-NEXT: .LBB69_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 offset:4
+; GCN-NEXT: v_alignbit_b32 v0, v4, v0, 16
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v3i16_to_v3bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v5, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_mov_b32_e32 v5, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 4, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_short v[3:4], v6
+; VI-NEXT: flat_store_dword v[1:2], v5
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v3i16_to_v3bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v6, v5
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_short v[1:2], v6, off offset:4
+; GFX9-NEXT: global_store_dword v[1:2], v5, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v3i16_to_v3bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v5, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b16 v[1:2], v6, off offset:4
+; GFX11-NEXT: global_store_b32 v[1:2], v5, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1034,8 +8505,95 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v4bf16_to_v4f16:
+
define void @v_bitcast_v4bf16_to_v4f16(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v4bf16_to_v4f16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v7, 0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v9, 0
+; GCN-NEXT: v_mov_b32_e32 v8, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB70_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v5
+; GCN-NEXT: .LBB70_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v9
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_or_b32_e32 v3, v0, v3
+; GCN-NEXT: v_or_b32_e32 v4, v5, v4
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v4bf16_to_v4f16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v5, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_mov_b32_e32 v5, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v4bf16_to_v4f16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v6, v5
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v4bf16_to_v4f16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v5, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1050,8 +8608,81 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v4bf16_to_v4i16:
+
define void @v_bitcast_v4bf16_to_v4i16(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v4bf16_to_v4i16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v7, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v8, v7
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB71_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_alignbit_b32 v7, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16
+; GCN-NEXT: .LBB71_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx2 v[7:8], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v4bf16_to_v4i16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v5, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_mov_b32_e32 v5, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v4bf16_to_v4i16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v6, v5
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v4bf16_to_v4i16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v5, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1066,8 +8697,81 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v4bf16_to_v2i32:
+
define void @v_bitcast_v4bf16_to_v2i32(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v4bf16_to_v2i32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v7, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v8, v7
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB72_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_alignbit_b32 v7, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16
+; GCN-NEXT: .LBB72_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx2 v[7:8], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v4bf16_to_v2i32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v5, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_mov_b32_e32 v5, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v4bf16_to_v2i32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v6, v5
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v4bf16_to_v2i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v5, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1082,8 +8786,81 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v4bf16_to_v2f32:
+
define void @v_bitcast_v4bf16_to_v2f32(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v4bf16_to_v2f32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v7, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v8, v7
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB73_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_alignbit_b32 v7, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16
+; GCN-NEXT: .LBB73_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx2 v[7:8], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v4bf16_to_v2f32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v5, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_mov_b32_e32 v5, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v4bf16_to_v2f32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v6, v5
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v4bf16_to_v2f32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v5, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1098,8 +8875,80 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v4bf16_to_f64:
+
define void @v_bitcast_v4bf16_to_f64(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v4bf16_to_f64:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v7, 0
+; GCN-NEXT: v_mov_b32_e32 v8, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB74_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_alignbit_b32 v7, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16
+; GCN-NEXT: .LBB74_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx2 v[7:8], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v4bf16_to_f64:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v5, 0
+; VI-NEXT: v_mov_b32_e32 v6, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_mov_b32_e32 v5, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v4bf16_to_f64:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v4bf16_to_f64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v5, 0
+; GFX11-NEXT: v_mov_b32_e32 v6, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1114,8 +8963,80 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v4bf16_to_i64:
+
define void @v_bitcast_v4bf16_to_i64(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v4bf16_to_i64:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v7, 0
+; GCN-NEXT: v_mov_b32_e32 v8, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB75_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_alignbit_b32 v7, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16
+; GCN-NEXT: .LBB75_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx2 v[7:8], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v4bf16_to_i64:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v5, 0
+; VI-NEXT: v_mov_b32_e32 v6, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_mov_b32_e32 v5, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v4bf16_to_i64:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v4bf16_to_i64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v5, 0
+; GFX11-NEXT: v_mov_b32_e32 v6, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1130,8 +9051,81 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v4bf16_to_v8i8:
+
define void @v_bitcast_v4bf16_to_v8i8(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v4bf16_to_v8i8:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v7, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v8, v7
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB76_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_alignbit_b32 v7, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16
+; GCN-NEXT: .LBB76_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx2 v[7:8], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v4bf16_to_v8i8:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v5, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_mov_b32_e32 v5, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v4bf16_to_v8i8:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v6, v5
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v4bf16_to_v8i8:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v5, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1146,8 +9140,87 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_i64_to_v4bf16:
+
define void @v_bitcast_i64_to_v4bf16(i32 %cond, ptr addrspace(1) %out, i64 %value) {
+; GCN-LABEL: v_bitcast_i64_to_v4bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v6, 0
+; GCN-NEXT: v_mov_b32_e32 v7, 0
+; GCN-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB77_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GCN-NEXT: .LBB77_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v0, v5, 16
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_i64_to_v4bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v5, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_mov_b32_e32 v5, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_i64_to_v4bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v6, v5
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_i64_to_v4bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v5, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1162,8 +9235,87 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v2f32_to_v4bf16:
+
define void @v_bitcast_v2f32_to_v4bf16(i32 %cond, ptr addrspace(1) %out, <2 x float> %value) {
+; GCN-LABEL: v_bitcast_v2f32_to_v4bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v6, 0
+; GCN-NEXT: v_mov_b32_e32 v7, 0
+; GCN-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB78_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GCN-NEXT: .LBB78_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v0, v5, 16
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v2f32_to_v4bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v5, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_mov_b32_e32 v5, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v2f32_to_v4bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v6, v5
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v2f32_to_v4bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v5, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1178,8 +9330,87 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v2i32_to_v4bf16:
+
define void @v_bitcast_v2i32_to_v4bf16(i32 %cond, ptr addrspace(1) %out, <2 x i32> %value) {
+; GCN-LABEL: v_bitcast_v2i32_to_v4bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v6, 0
+; GCN-NEXT: v_mov_b32_e32 v7, 0
+; GCN-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB79_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GCN-NEXT: .LBB79_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v0, v5, 16
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v2i32_to_v4bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v5, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_mov_b32_e32 v5, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v2i32_to_v4bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v6, v5
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v2i32_to_v4bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v5, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1194,8 +9425,87 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v4i16_to_v4bf16:
+
define void @v_bitcast_v4i16_to_v4bf16(i32 %cond, ptr addrspace(1) %out, <4 x i16> %value) {
+; GCN-LABEL: v_bitcast_v4i16_to_v4bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v7, 0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v9, 0
+; GCN-NEXT: v_mov_b32_e32 v8, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB80_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GCN-NEXT: .LBB80_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_alignbit_b32 v3, v3, v0, 16
+; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v4i16_to_v4bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v5, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_mov_b32_e32 v5, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v4i16_to_v4bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v6, v5
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v4i16_to_v4bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v5, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1210,8 +9520,91 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v4f16_to_v4bf16:
+
define void @v_bitcast_v4f16_to_v4bf16(i32 %cond, ptr addrspace(1) %out, <4 x half> %value) {
+; GCN-LABEL: v_bitcast_v4f16_to_v4bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v7, 0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v9, 0
+; GCN-NEXT: v_mov_b32_e32 v8, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB81_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; GCN-NEXT: .LBB81_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_alignbit_b32 v3, v3, v0, 16
+; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v4f16_to_v4bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v5, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_mov_b32_e32 v5, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v4f16_to_v4bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v6, v5
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v4f16_to_v4bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v5, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1226,8 +9619,96 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v6bf16_to_v6i16:
+
define void @v_bitcast_v6bf16_to_v6i16(i32 %cond, ptr addrspace(1) %out, <6 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v6bf16_to_v6i16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v9, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v10, v9
+; GCN-NEXT: v_mov_b32_e32 v0, v9
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB82_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_alignbit_b32 v9, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v10, v4, v5, 16
+; GCN-NEXT: v_alignbit_b32 v0, v6, v7, 16
+; GCN-NEXT: .LBB82_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:8
+; GCN-NEXT: buffer_store_dwordx2 v[9:10], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v6bf16_to_v6i16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v6, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v7, v6
+; VI-NEXT: v_mov_b32_e32 v8, v6
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v9, v6
+; VI-NEXT: v_mov_b32_e32 v8, v5
+; VI-NEXT: v_mov_b32_e32 v7, v4
+; VI-NEXT: v_mov_b32_e32 v6, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx3 v[1:2], v[6:8]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v6bf16_to_v6i16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v7, v6
+; GFX9-NEXT: v_mov_b32_e32 v8, v6
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v9, v6
+; GFX9-NEXT: v_mov_b32_e32 v8, v5
+; GFX9-NEXT: v_mov_b32_e32 v7, v4
+; GFX9-NEXT: v_mov_b32_e32 v6, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx3 v[1:2], v[6:8], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v6bf16_to_v6i16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v6, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v7, v6
+; GFX11-NEXT: v_mov_b32_e32 v8, v6
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_mov_b32_e32 v9, v6
+; GFX11-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v7, v4
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b96 v[1:2], v[6:8], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1242,8 +9723,117 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v6bf16_to_v6f16:
+
define void @v_bitcast_v6bf16_to_v6f16(i32 %cond, ptr addrspace(1) %out, <6 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v6bf16_to_v6f16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v10, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v11, 0
+; GCN-NEXT: v_mov_b32_e32 v13, 0
+; GCN-NEXT: v_mov_b32_e32 v9, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB83_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v7
+; GCN-NEXT: .LBB83_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v12
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v10
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v13
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v11
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_or_b32_e32 v3, v4, v3
+; GCN-NEXT: v_or_b32_e32 v4, v6, v5
+; GCN-NEXT: v_or_b32_e32 v0, v7, v0
+; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:8
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v6bf16_to_v6f16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v6, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v7, v6
+; VI-NEXT: v_mov_b32_e32 v8, v6
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v9, v6
+; VI-NEXT: v_mov_b32_e32 v8, v5
+; VI-NEXT: v_mov_b32_e32 v7, v4
+; VI-NEXT: v_mov_b32_e32 v6, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx3 v[1:2], v[6:8]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v6bf16_to_v6f16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v7, v6
+; GFX9-NEXT: v_mov_b32_e32 v8, v6
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v9, v6
+; GFX9-NEXT: v_mov_b32_e32 v8, v5
+; GFX9-NEXT: v_mov_b32_e32 v7, v4
+; GFX9-NEXT: v_mov_b32_e32 v6, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx3 v[1:2], v[6:8], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v6bf16_to_v6f16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v6, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v7, v6
+; GFX11-NEXT: v_mov_b32_e32 v8, v6
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_mov_b32_e32 v9, v6
+; GFX11-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v7, v4
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b96 v[1:2], v[6:8], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1258,8 +9848,93 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v6bf16_to_v12i8:
+
define void @v_bitcast_v6bf16_to_v12i8(i32 %cond, ptr addrspace(1) %out, <6 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v6bf16_to_v12i8:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v9, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v10, v9
+; GCN-NEXT: v_mov_b32_e32 v0, v9
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB84_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_alignbit_b32 v9, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v10, v4, v5, 16
+; GCN-NEXT: v_alignbit_b32 v0, v6, v7, 16
+; GCN-NEXT: .LBB84_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:8
+; GCN-NEXT: buffer_store_dwordx2 v[9:10], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v6bf16_to_v12i8:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v6, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v7, v6
+; VI-NEXT: v_mov_b32_e32 v8, v6
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v8, v5
+; VI-NEXT: v_mov_b32_e32 v7, v4
+; VI-NEXT: v_mov_b32_e32 v6, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx3 v[1:2], v[6:8]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v6bf16_to_v12i8:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v7, v6
+; GFX9-NEXT: v_mov_b32_e32 v8, v6
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v8, v5
+; GFX9-NEXT: v_mov_b32_e32 v7, v4
+; GFX9-NEXT: v_mov_b32_e32 v6, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx3 v[1:2], v[6:8], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v6bf16_to_v12i8:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v6, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v7, v6
+; GFX11-NEXT: v_mov_b32_e32 v8, v6
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v7, v4
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b96 v[1:2], v[6:8], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1274,8 +9949,111 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v6f16_to_v6bf16:
+
define void @v_bitcast_v6f16_to_v6bf16(i32 %cond, ptr addrspace(1) %out, <6 x half> %value) {
+; GCN-LABEL: v_bitcast_v6f16_to_v6bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v10, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v11, 0
+; GCN-NEXT: v_mov_b32_e32 v13, 0
+; GCN-NEXT: v_mov_b32_e32 v9, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB85_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GCN-NEXT: .LBB85_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v11
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
+; GCN-NEXT: v_alignbit_b32 v0, v0, v7, 16
+; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:8
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v6f16_to_v6bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v6, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v7, v6
+; VI-NEXT: v_mov_b32_e32 v8, v6
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v9, v6
+; VI-NEXT: v_mov_b32_e32 v8, v5
+; VI-NEXT: v_mov_b32_e32 v7, v4
+; VI-NEXT: v_mov_b32_e32 v6, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx3 v[1:2], v[6:8]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v6f16_to_v6bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v7, v6
+; GFX9-NEXT: v_mov_b32_e32 v8, v6
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v9, v6
+; GFX9-NEXT: v_mov_b32_e32 v8, v5
+; GFX9-NEXT: v_mov_b32_e32 v7, v4
+; GFX9-NEXT: v_mov_b32_e32 v6, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx3 v[1:2], v[6:8], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v6f16_to_v6bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v6, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v7, v6
+; GFX11-NEXT: v_mov_b32_e32 v8, v6
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_mov_b32_e32 v9, v6
+; GFX11-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v7, v4
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b96 v[1:2], v[6:8], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1290,8 +10068,105 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v6i16_to_v6bf16:
+
define void @v_bitcast_v6i16_to_v6bf16(i32 %cond, ptr addrspace(1) %out, <6 x i16> %value) {
+; GCN-LABEL: v_bitcast_v6i16_to_v6bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v10, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v11, 0
+; GCN-NEXT: v_mov_b32_e32 v13, 0
+; GCN-NEXT: v_mov_b32_e32 v9, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB86_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v8
+; GCN-NEXT: .LBB86_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v11
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
+; GCN-NEXT: v_alignbit_b32 v0, v0, v7, 16
+; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:8
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v6i16_to_v6bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v6, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v7, v6
+; VI-NEXT: v_mov_b32_e32 v8, v6
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v9, v6
+; VI-NEXT: v_mov_b32_e32 v8, v5
+; VI-NEXT: v_mov_b32_e32 v7, v4
+; VI-NEXT: v_mov_b32_e32 v6, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx3 v[1:2], v[6:8]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v6i16_to_v6bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v7, v6
+; GFX9-NEXT: v_mov_b32_e32 v8, v6
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v9, v6
+; GFX9-NEXT: v_mov_b32_e32 v8, v5
+; GFX9-NEXT: v_mov_b32_e32 v7, v4
+; GFX9-NEXT: v_mov_b32_e32 v6, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx3 v[1:2], v[6:8], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v6i16_to_v6bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v6, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v7, v6
+; GFX11-NEXT: v_mov_b32_e32 v8, v6
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_mov_b32_e32 v9, v6
+; GFX11-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v7, v4
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b96 v[1:2], v[6:8], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1306,8 +10181,173 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v12i8_to_v6bf16:
+
define void @v_bitcast_v12i8_to_v6bf16(i32 %cond, ptr addrspace(1) %out, <12 x i8> %value) {
+; GCN-LABEL: v_bitcast_v12i8_to_v6bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v16, 0
+; GCN-NEXT: v_mov_b32_e32 v18, 0
+; GCN-NEXT: v_mov_b32_e32 v17, 0
+; GCN-NEXT: v_mov_b32_e32 v19, 0
+; GCN-NEXT: v_mov_b32_e32 v15, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB87_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_and_b32_e32 v0, 0xff, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v4
+; GCN-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v6
+; GCN-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v8
+; GCN-NEXT: v_and_b32_e32 v8, 0xff, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v10
+; GCN-NEXT: v_and_b32_e32 v10, 0xff, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v12
+; GCN-NEXT: v_and_b32_e32 v12, 0xff, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_or_b32_e32 v6, v6, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v12
+; GCN-NEXT: v_or_b32_e32 v16, v3, v0
+; GCN-NEXT: v_or_b32_e32 v18, v5, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GCN-NEXT: v_or_b32_e32 v19, v9, v7
+; GCN-NEXT: v_or_b32_e32 v15, v11, v8
+; GCN-NEXT: v_or_b32_e32 v0, v13, v10
+; GCN-NEXT: .LBB87_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v17
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
+; GCN-NEXT: v_alignbit_b32 v0, v0, v7, 16
+; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:8
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v12i8_to_v6bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v15, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v16, v15
+; VI-NEXT: v_mov_b32_e32 v17, v15
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB87_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4
+; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v6
+; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v15, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v8
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v10
+; VI-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v16, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v12
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v14
+; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v17, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: .LBB87_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx3 v[1:2], v[15:17]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v12i8_to_v6bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v15, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v16, v15
+; GFX9-NEXT: v_mov_b32_e32 v17, v15
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB87_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4
+; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v6
+; GFX9-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v15, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v10
+; GFX9-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v16, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v14
+; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v17, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: .LBB87_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx3 v[1:2], v[15:17], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v12i8_to_v6bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v15, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v16, v15
+; GFX11-NEXT: v_mov_b32_e32 v17, v15
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB87_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v3
+; GFX11-NEXT: v_lshlrev_b16 v3, 8, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-NEXT: v_lshlrev_b16 v5, 8, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v9
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX11-NEXT: v_lshlrev_b16 v3, 8, v8
+; GFX11-NEXT: v_lshlrev_b16 v8, 8, v10
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v11
+; GFX11-NEXT: v_lshlrev_b16 v10, 8, v12
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v13
+; GFX11-NEXT: v_lshlrev_b16 v12, 8, v14
+; GFX11-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX11-NEXT: v_or_b32_e32 v3, v6, v3
+; GFX11-NEXT: v_or_b32_e32 v5, v7, v8
+; GFX11-NEXT: v_or_b32_e32 v6, v9, v10
+; GFX11-NEXT: v_or_b32_e32 v7, v11, v12
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_or_b32_e32 v15, v0, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_or_b32_e32 v16, v3, v5
+; GFX11-NEXT: v_or_b32_e32 v17, v6, v7
+; GFX11-NEXT: .LBB87_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b96 v[1:2], v[15:17], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1322,8 +10362,102 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v8bf16_to_v2f64:
+
define void @v_bitcast_v8bf16_to_v2f64(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v8bf16_to_v2f64:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v11, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v12, v11
+; GCN-NEXT: v_mov_b32_e32 v13, v11
+; GCN-NEXT: v_mov_b32_e32 v14, v11
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB88_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_alignbit_b32 v11, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v12, v4, v5, 16
+; GCN-NEXT: v_alignbit_b32 v13, v6, v7, 16
+; GCN-NEXT: v_alignbit_b32 v14, v8, v9, 16
+; GCN-NEXT: .LBB88_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v8bf16_to_v2f64:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v8, v7
+; VI-NEXT: v_mov_b32_e32 v9, v7
+; VI-NEXT: v_mov_b32_e32 v10, v7
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v10, v6
+; VI-NEXT: v_mov_b32_e32 v9, v5
+; VI-NEXT: v_mov_b32_e32 v8, v4
+; VI-NEXT: v_mov_b32_e32 v7, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v8bf16_to_v2f64:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v8, v7
+; GFX9-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v10, v6
+; GFX9-NEXT: v_mov_b32_e32 v9, v5
+; GFX9-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v8bf16_to_v2f64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v7, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v8, v7
+; GFX11-NEXT: v_mov_b32_e32 v9, v7
+; GFX11-NEXT: v_mov_b32_e32 v10, v7
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
+; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1338,8 +10472,102 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v8bf16_to_v2i64:
+
define void @v_bitcast_v8bf16_to_v2i64(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v8bf16_to_v2i64:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v11, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v12, v11
+; GCN-NEXT: v_mov_b32_e32 v13, v11
+; GCN-NEXT: v_mov_b32_e32 v14, v11
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB89_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_alignbit_b32 v11, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v12, v4, v5, 16
+; GCN-NEXT: v_alignbit_b32 v13, v6, v7, 16
+; GCN-NEXT: v_alignbit_b32 v14, v8, v9, 16
+; GCN-NEXT: .LBB89_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v8bf16_to_v2i64:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v8, v7
+; VI-NEXT: v_mov_b32_e32 v9, v7
+; VI-NEXT: v_mov_b32_e32 v10, v7
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v10, v6
+; VI-NEXT: v_mov_b32_e32 v9, v5
+; VI-NEXT: v_mov_b32_e32 v8, v4
+; VI-NEXT: v_mov_b32_e32 v7, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v8bf16_to_v2i64:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v8, v7
+; GFX9-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v10, v6
+; GFX9-NEXT: v_mov_b32_e32 v9, v5
+; GFX9-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v8bf16_to_v2i64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v7, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v8, v7
+; GFX11-NEXT: v_mov_b32_e32 v9, v7
+; GFX11-NEXT: v_mov_b32_e32 v10, v7
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
+; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1354,8 +10582,102 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v8bf16_to_v4f32:
+
define void @v_bitcast_v8bf16_to_v4f32(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v8bf16_to_v4f32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v11, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v12, v11
+; GCN-NEXT: v_mov_b32_e32 v13, v11
+; GCN-NEXT: v_mov_b32_e32 v14, v11
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB90_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_alignbit_b32 v11, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v12, v4, v5, 16
+; GCN-NEXT: v_alignbit_b32 v13, v6, v7, 16
+; GCN-NEXT: v_alignbit_b32 v14, v8, v9, 16
+; GCN-NEXT: .LBB90_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v8bf16_to_v4f32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v8, v7
+; VI-NEXT: v_mov_b32_e32 v9, v7
+; VI-NEXT: v_mov_b32_e32 v10, v7
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v10, v6
+; VI-NEXT: v_mov_b32_e32 v9, v5
+; VI-NEXT: v_mov_b32_e32 v8, v4
+; VI-NEXT: v_mov_b32_e32 v7, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v8bf16_to_v4f32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v8, v7
+; GFX9-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v10, v6
+; GFX9-NEXT: v_mov_b32_e32 v9, v5
+; GFX9-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v8bf16_to_v4f32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v7, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v8, v7
+; GFX11-NEXT: v_mov_b32_e32 v9, v7
+; GFX11-NEXT: v_mov_b32_e32 v10, v7
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
+; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1370,8 +10692,102 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v8bf16_to_v4i32:
+
define void @v_bitcast_v8bf16_to_v4i32(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v8bf16_to_v4i32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v11, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v12, v11
+; GCN-NEXT: v_mov_b32_e32 v13, v11
+; GCN-NEXT: v_mov_b32_e32 v14, v11
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB91_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_alignbit_b32 v11, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v12, v4, v5, 16
+; GCN-NEXT: v_alignbit_b32 v13, v6, v7, 16
+; GCN-NEXT: v_alignbit_b32 v14, v8, v9, 16
+; GCN-NEXT: .LBB91_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v8bf16_to_v4i32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v8, v7
+; VI-NEXT: v_mov_b32_e32 v9, v7
+; VI-NEXT: v_mov_b32_e32 v10, v7
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v10, v6
+; VI-NEXT: v_mov_b32_e32 v9, v5
+; VI-NEXT: v_mov_b32_e32 v8, v4
+; VI-NEXT: v_mov_b32_e32 v7, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v8bf16_to_v4i32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v8, v7
+; GFX9-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v10, v6
+; GFX9-NEXT: v_mov_b32_e32 v9, v5
+; GFX9-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v8bf16_to_v4i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v7, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v8, v7
+; GFX11-NEXT: v_mov_b32_e32 v9, v7
+; GFX11-NEXT: v_mov_b32_e32 v10, v7
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
+; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1386,8 +10802,130 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v8bf16_to_v8f16:
+
define void @v_bitcast_v8bf16_to_v8f16(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v8bf16_to_v8f16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v11, 0
+; GCN-NEXT: v_mov_b32_e32 v15, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v16, 0
+; GCN-NEXT: v_mov_b32_e32 v13, 0
+; GCN-NEXT: v_mov_b32_e32 v17, 0
+; GCN-NEXT: v_mov_b32_e32 v14, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB92_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v17, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v14, v8
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v9
+; GCN-NEXT: .LBB92_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v15
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v16
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v12
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v17
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v13
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_or_b32_e32 v3, v4, v3
+; GCN-NEXT: v_or_b32_e32 v4, v6, v5
+; GCN-NEXT: v_or_b32_e32 v5, v8, v7
+; GCN-NEXT: v_or_b32_e32 v6, v9, v0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v8bf16_to_v8f16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v8, v7
+; VI-NEXT: v_mov_b32_e32 v9, v7
+; VI-NEXT: v_mov_b32_e32 v10, v7
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v10, v6
+; VI-NEXT: v_mov_b32_e32 v9, v5
+; VI-NEXT: v_mov_b32_e32 v8, v4
+; VI-NEXT: v_mov_b32_e32 v7, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v8bf16_to_v8f16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v8, v7
+; GFX9-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v10, v6
+; GFX9-NEXT: v_mov_b32_e32 v9, v5
+; GFX9-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v8bf16_to_v8f16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v7, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v8, v7
+; GFX11-NEXT: v_mov_b32_e32 v9, v7
+; GFX11-NEXT: v_mov_b32_e32 v10, v7
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
+; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1402,8 +10940,102 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v8bf16_to_v8i16:
+
define void @v_bitcast_v8bf16_to_v8i16(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v8bf16_to_v8i16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v11, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v12, v11
+; GCN-NEXT: v_mov_b32_e32 v13, v11
+; GCN-NEXT: v_mov_b32_e32 v14, v11
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB93_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_alignbit_b32 v11, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v12, v4, v5, 16
+; GCN-NEXT: v_alignbit_b32 v13, v6, v7, 16
+; GCN-NEXT: v_alignbit_b32 v14, v8, v9, 16
+; GCN-NEXT: .LBB93_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v8bf16_to_v8i16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v8, v7
+; VI-NEXT: v_mov_b32_e32 v9, v7
+; VI-NEXT: v_mov_b32_e32 v10, v7
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v10, v6
+; VI-NEXT: v_mov_b32_e32 v9, v5
+; VI-NEXT: v_mov_b32_e32 v8, v4
+; VI-NEXT: v_mov_b32_e32 v7, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v8bf16_to_v8i16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v8, v7
+; GFX9-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v10, v6
+; GFX9-NEXT: v_mov_b32_e32 v9, v5
+; GFX9-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v8bf16_to_v8i16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v7, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v8, v7
+; GFX11-NEXT: v_mov_b32_e32 v9, v7
+; GFX11-NEXT: v_mov_b32_e32 v10, v7
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
+; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1418,8 +11050,122 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v8f16_to_v8bf16:
+
define void @v_bitcast_v8f16_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <8 x half> %value) {
+; GCN-LABEL: v_bitcast_v8f16_to_v8bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v11, 0
+; GCN-NEXT: v_mov_b32_e32 v15, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v16, 0
+; GCN-NEXT: v_mov_b32_e32 v13, 0
+; GCN-NEXT: v_mov_b32_e32 v17, 0
+; GCN-NEXT: v_mov_b32_e32 v14, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB94_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v9
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v9
+; GCN-NEXT: .LBB94_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
+; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
+; GCN-NEXT: v_alignbit_b32 v6, v0, v9, 16
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v8f16_to_v8bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v8, v7
+; VI-NEXT: v_mov_b32_e32 v9, v7
+; VI-NEXT: v_mov_b32_e32 v10, v7
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v10, v6
+; VI-NEXT: v_mov_b32_e32 v9, v5
+; VI-NEXT: v_mov_b32_e32 v8, v4
+; VI-NEXT: v_mov_b32_e32 v7, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v8f16_to_v8bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v8, v7
+; GFX9-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v10, v6
+; GFX9-NEXT: v_mov_b32_e32 v9, v5
+; GFX9-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v8f16_to_v8bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v7, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v8, v7
+; GFX11-NEXT: v_mov_b32_e32 v9, v7
+; GFX11-NEXT: v_mov_b32_e32 v10, v7
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
+; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1434,8 +11180,114 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v8i16_to_v8bf16:
+
define void @v_bitcast_v8i16_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <8 x i16> %value) {
+; GCN-LABEL: v_bitcast_v8i16_to_v8bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v11, 0
+; GCN-NEXT: v_mov_b32_e32 v15, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v16, 0
+; GCN-NEXT: v_mov_b32_e32 v13, 0
+; GCN-NEXT: v_mov_b32_e32 v17, 0
+; GCN-NEXT: v_mov_b32_e32 v14, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB95_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v10
+; GCN-NEXT: .LBB95_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
+; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
+; GCN-NEXT: v_alignbit_b32 v6, v0, v9, 16
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v8i16_to_v8bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v8, v7
+; VI-NEXT: v_mov_b32_e32 v9, v7
+; VI-NEXT: v_mov_b32_e32 v10, v7
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v10, v6
+; VI-NEXT: v_mov_b32_e32 v9, v5
+; VI-NEXT: v_mov_b32_e32 v8, v4
+; VI-NEXT: v_mov_b32_e32 v7, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v8i16_to_v8bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v8, v7
+; GFX9-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v10, v6
+; GFX9-NEXT: v_mov_b32_e32 v9, v5
+; GFX9-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v8i16_to_v8bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v7, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v8, v7
+; GFX11-NEXT: v_mov_b32_e32 v9, v7
+; GFX11-NEXT: v_mov_b32_e32 v10, v7
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
+; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1450,8 +11302,208 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v16i8_to_v8bf16:
+
define void @v_bitcast_v16i8_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <16 x i8> %value) {
+; GCN-LABEL: v_bitcast_v16i8_to_v8bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v19, 0
+; GCN-NEXT: v_mov_b32_e32 v23, 0
+; GCN-NEXT: v_mov_b32_e32 v20, 0
+; GCN-NEXT: v_mov_b32_e32 v24, 0
+; GCN-NEXT: v_mov_b32_e32 v21, 0
+; GCN-NEXT: v_mov_b32_e32 v25, 0
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB96_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_and_b32_e32 v0, 0xff, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v4
+; GCN-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v6
+; GCN-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v8
+; GCN-NEXT: v_and_b32_e32 v8, 0xff, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v10
+; GCN-NEXT: v_and_b32_e32 v10, 0xff, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v12
+; GCN-NEXT: v_and_b32_e32 v12, 0xff, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v14
+; GCN-NEXT: v_and_b32_e32 v14, 0xff, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v16
+; GCN-NEXT: v_and_b32_e32 v16, 0xff, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_or_b32_e32 v6, v6, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v12
+; GCN-NEXT: v_or_b32_e32 v12, v14, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v16
+; GCN-NEXT: v_or_b32_e32 v19, v3, v0
+; GCN-NEXT: v_or_b32_e32 v23, v5, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v6
+; GCN-NEXT: v_or_b32_e32 v24, v9, v7
+; GCN-NEXT: v_or_b32_e32 v21, v11, v8
+; GCN-NEXT: v_or_b32_e32 v25, v13, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v12
+; GCN-NEXT: v_or_b32_e32 v0, v17, v14
+; GCN-NEXT: .LBB96_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
+; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
+; GCN-NEXT: v_alignbit_b32 v6, v0, v9, 16
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v16i8_to_v8bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v19, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v20, v19
+; VI-NEXT: v_mov_b32_e32 v21, v19
+; VI-NEXT: v_mov_b32_e32 v22, v19
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB96_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4
+; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v6
+; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v19, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v8
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v10
+; VI-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v20, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v12
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v14
+; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v21, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v16
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v18
+; VI-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v22, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: .LBB96_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v16i8_to_v8bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v19, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v20, v19
+; GFX9-NEXT: v_mov_b32_e32 v21, v19
+; GFX9-NEXT: v_mov_b32_e32 v22, v19
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB96_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4
+; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v6
+; GFX9-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v19, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v10
+; GFX9-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v20, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v14
+; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v21, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v18
+; GFX9-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v22, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: .LBB96_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v16i8_to_v8bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v19, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v20, v19
+; GFX11-NEXT: v_mov_b32_e32 v21, v19
+; GFX11-NEXT: v_mov_b32_e32 v22, v19
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB96_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v3
+; GFX11-NEXT: v_lshlrev_b16 v3, 8, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-NEXT: v_lshlrev_b16 v5, 8, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT: v_lshlrev_b16 v7, 8, v8
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX11-NEXT: v_lshlrev_b16 v8, 8, v12
+; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v9
+; GFX11-NEXT: v_or_b32_e32 v4, v6, v7
+; GFX11-NEXT: v_lshlrev_b16 v6, 8, v10
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v11
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v13
+; GFX11-NEXT: v_lshlrev_b16 v10, 8, v14
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v15
+; GFX11-NEXT: v_lshlrev_b16 v12, 8, v16
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v17
+; GFX11-NEXT: v_lshlrev_b16 v14, 8, v18
+; GFX11-NEXT: v_or_b32_e32 v5, v5, v6
+; GFX11-NEXT: v_or_b32_e32 v6, v7, v8
+; GFX11-NEXT: v_or_b32_e32 v7, v9, v10
+; GFX11-NEXT: v_or_b32_e32 v8, v11, v12
+; GFX11-NEXT: v_or_b32_e32 v9, v13, v14
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_or_b32_e32 v19, v0, v3
+; GFX11-NEXT: v_or_b32_e32 v20, v4, v5
+; GFX11-NEXT: v_or_b32_e32 v21, v6, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_or_b32_e32 v22, v8, v9
+; GFX11-NEXT: .LBB96_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1466,8 +11518,114 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v2i64_to_v8bf16:
+
define void @v_bitcast_v2i64_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <2 x i64> %value) {
+; GCN-LABEL: v_bitcast_v2i64_to_v8bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v13, 0
+; GCN-NEXT: v_mov_b32_e32 v10, 0
+; GCN-NEXT: v_mov_b32_e32 v11, 0
+; GCN-NEXT: v_mov_b32_e32 v8, 0
+; GCN-NEXT: v_mov_b32_e32 v9, 0
+; GCN-NEXT: v_mov_b32_e32 v7, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB97_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v4
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GCN-NEXT: .LBB97_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
+; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16
+; GCN-NEXT: v_alignbit_b32 v6, v0, v7, 16
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v2i64_to_v8bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v8, v7
+; VI-NEXT: v_mov_b32_e32 v9, v7
+; VI-NEXT: v_mov_b32_e32 v10, v7
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v10, v6
+; VI-NEXT: v_mov_b32_e32 v9, v5
+; VI-NEXT: v_mov_b32_e32 v8, v4
+; VI-NEXT: v_mov_b32_e32 v7, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v2i64_to_v8bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v8, v7
+; GFX9-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v10, v6
+; GFX9-NEXT: v_mov_b32_e32 v9, v5
+; GFX9-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v2i64_to_v8bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v7, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v8, v7
+; GFX11-NEXT: v_mov_b32_e32 v9, v7
+; GFX11-NEXT: v_mov_b32_e32 v10, v7
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
+; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1482,8 +11640,114 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v2f64_to_v8bf16:
+
define void @v_bitcast_v2f64_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <2 x double> %value) {
+; GCN-LABEL: v_bitcast_v2f64_to_v8bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v13, 0
+; GCN-NEXT: v_mov_b32_e32 v10, 0
+; GCN-NEXT: v_mov_b32_e32 v11, 0
+; GCN-NEXT: v_mov_b32_e32 v8, 0
+; GCN-NEXT: v_mov_b32_e32 v9, 0
+; GCN-NEXT: v_mov_b32_e32 v7, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB98_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v4
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GCN-NEXT: .LBB98_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
+; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16
+; GCN-NEXT: v_alignbit_b32 v6, v0, v7, 16
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v2f64_to_v8bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v8, v7
+; VI-NEXT: v_mov_b32_e32 v9, v7
+; VI-NEXT: v_mov_b32_e32 v10, v7
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v10, v6
+; VI-NEXT: v_mov_b32_e32 v9, v5
+; VI-NEXT: v_mov_b32_e32 v8, v4
+; VI-NEXT: v_mov_b32_e32 v7, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v2f64_to_v8bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v8, v7
+; GFX9-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v10, v6
+; GFX9-NEXT: v_mov_b32_e32 v9, v5
+; GFX9-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v2f64_to_v8bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v7, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v8, v7
+; GFX11-NEXT: v_mov_b32_e32 v9, v7
+; GFX11-NEXT: v_mov_b32_e32 v10, v7
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
+; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1498,8 +11762,114 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v4i32_to_v8bf16:
+
define void @v_bitcast_v4i32_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <4 x i32> %value) {
+; GCN-LABEL: v_bitcast_v4i32_to_v8bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v13, 0
+; GCN-NEXT: v_mov_b32_e32 v10, 0
+; GCN-NEXT: v_mov_b32_e32 v11, 0
+; GCN-NEXT: v_mov_b32_e32 v8, 0
+; GCN-NEXT: v_mov_b32_e32 v9, 0
+; GCN-NEXT: v_mov_b32_e32 v7, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB99_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v4
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GCN-NEXT: .LBB99_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
+; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16
+; GCN-NEXT: v_alignbit_b32 v6, v0, v7, 16
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v4i32_to_v8bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v8, v7
+; VI-NEXT: v_mov_b32_e32 v9, v7
+; VI-NEXT: v_mov_b32_e32 v10, v7
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v10, v6
+; VI-NEXT: v_mov_b32_e32 v9, v5
+; VI-NEXT: v_mov_b32_e32 v8, v4
+; VI-NEXT: v_mov_b32_e32 v7, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v4i32_to_v8bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v8, v7
+; GFX9-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v10, v6
+; GFX9-NEXT: v_mov_b32_e32 v9, v5
+; GFX9-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v4i32_to_v8bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v7, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v8, v7
+; GFX11-NEXT: v_mov_b32_e32 v9, v7
+; GFX11-NEXT: v_mov_b32_e32 v10, v7
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
+; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1514,8 +11884,114 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v4f32_to_v8bf16:
+
define void @v_bitcast_v4f32_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <4 x float> %value) {
+; GCN-LABEL: v_bitcast_v4f32_to_v8bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v13, 0
+; GCN-NEXT: v_mov_b32_e32 v10, 0
+; GCN-NEXT: v_mov_b32_e32 v11, 0
+; GCN-NEXT: v_mov_b32_e32 v8, 0
+; GCN-NEXT: v_mov_b32_e32 v9, 0
+; GCN-NEXT: v_mov_b32_e32 v7, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB100_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v4
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GCN-NEXT: .LBB100_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
+; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16
+; GCN-NEXT: v_alignbit_b32 v6, v0, v7, 16
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v4f32_to_v8bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v8, v7
+; VI-NEXT: v_mov_b32_e32 v9, v7
+; VI-NEXT: v_mov_b32_e32 v10, v7
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v10, v6
+; VI-NEXT: v_mov_b32_e32 v9, v5
+; VI-NEXT: v_mov_b32_e32 v8, v4
+; VI-NEXT: v_mov_b32_e32 v7, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v4f32_to_v8bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v8, v7
+; GFX9-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v10, v6
+; GFX9-NEXT: v_mov_b32_e32 v9, v5
+; GFX9-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v4f32_to_v8bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v7, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v8, v7
+; GFX11-NEXT: v_mov_b32_e32 v9, v7
+; GFX11-NEXT: v_mov_b32_e32 v10, v7
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
+; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1530,8 +12006,151 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v16bf16_to_v16i16:
+
define void @v_bitcast_v16bf16_to_v16i16(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v16bf16_to_v16i16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v19, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v20, v19
+; GCN-NEXT: v_mov_b32_e32 v21, v19
+; GCN-NEXT: v_mov_b32_e32 v22, v19
+; GCN-NEXT: v_mov_b32_e32 v23, v19
+; GCN-NEXT: v_mov_b32_e32 v24, v19
+; GCN-NEXT: v_mov_b32_e32 v25, v19
+; GCN-NEXT: v_mov_b32_e32 v26, v19
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB101_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_alignbit_b32 v19, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v20, v4, v5, 16
+; GCN-NEXT: v_alignbit_b32 v21, v6, v7, 16
+; GCN-NEXT: v_alignbit_b32 v22, v8, v9, 16
+; GCN-NEXT: v_alignbit_b32 v23, v10, v11, 16
+; GCN-NEXT: v_alignbit_b32 v24, v12, v13, 16
+; GCN-NEXT: v_alignbit_b32 v25, v14, v15, 16
+; GCN-NEXT: v_alignbit_b32 v26, v16, v17, 16
+; GCN-NEXT: .LBB101_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[23:26], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v16bf16_to_v16i16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v11, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v12, v11
+; VI-NEXT: v_mov_b32_e32 v13, v11
+; VI-NEXT: v_mov_b32_e32 v14, v11
+; VI-NEXT: v_mov_b32_e32 v15, v11
+; VI-NEXT: v_mov_b32_e32 v16, v11
+; VI-NEXT: v_mov_b32_e32 v17, v11
+; VI-NEXT: v_mov_b32_e32 v18, v11
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v18, v10
+; VI-NEXT: v_mov_b32_e32 v17, v9
+; VI-NEXT: v_mov_b32_e32 v16, v8
+; VI-NEXT: v_mov_b32_e32 v15, v7
+; VI-NEXT: v_mov_b32_e32 v14, v6
+; VI-NEXT: v_mov_b32_e32 v13, v5
+; VI-NEXT: v_mov_b32_e32 v12, v4
+; VI-NEXT: v_mov_b32_e32 v11, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v16bf16_to_v16i16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v11, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v12, v11
+; GFX9-NEXT: v_mov_b32_e32 v13, v11
+; GFX9-NEXT: v_mov_b32_e32 v14, v11
+; GFX9-NEXT: v_mov_b32_e32 v15, v11
+; GFX9-NEXT: v_mov_b32_e32 v16, v11
+; GFX9-NEXT: v_mov_b32_e32 v17, v11
+; GFX9-NEXT: v_mov_b32_e32 v18, v11
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v18, v10
+; GFX9-NEXT: v_mov_b32_e32 v17, v9
+; GFX9-NEXT: v_mov_b32_e32 v16, v8
+; GFX9-NEXT: v_mov_b32_e32 v15, v7
+; GFX9-NEXT: v_mov_b32_e32 v14, v6
+; GFX9-NEXT: v_mov_b32_e32 v13, v5
+; GFX9-NEXT: v_mov_b32_e32 v12, v4
+; GFX9-NEXT: v_mov_b32_e32 v11, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v16bf16_to_v16i16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v11, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v12, v11
+; GFX11-NEXT: v_mov_b32_e32 v13, v11
+; GFX11-NEXT: v_mov_b32_e32 v14, v11
+; GFX11-NEXT: v_mov_b32_e32 v15, v11
+; GFX11-NEXT: v_mov_b32_e32 v16, v11
+; GFX11-NEXT: v_mov_b32_e32 v17, v11
+; GFX11-NEXT: v_mov_b32_e32 v18, v11
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9
+; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7
+; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5
+; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1546,8 +12165,207 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v16bf16_to_v16f16:
+
define void @v_bitcast_v16bf16_to_v16f16(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v16bf16_to_v16f16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v26, 0
+; GCN-NEXT: v_mov_b32_e32 v30, 0
+; GCN-NEXT: v_mov_b32_e32 v27, 0
+; GCN-NEXT: v_mov_b32_e32 v31, 0
+; GCN-NEXT: v_mov_b32_e32 v28, 0
+; GCN-NEXT: v_mov_b32_e32 v32, 0
+; GCN-NEXT: v_mov_b32_e32 v29, 0
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: v_mov_b32_e32 v19, 0
+; GCN-NEXT: v_mov_b32_e32 v23, 0
+; GCN-NEXT: v_mov_b32_e32 v20, 0
+; GCN-NEXT: v_mov_b32_e32 v24, 0
+; GCN-NEXT: v_mov_b32_e32 v21, 0
+; GCN-NEXT: v_mov_b32_e32 v25, 0
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB102_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_cvt_f32_f16_e32 v26, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v30, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v27, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v28, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v29, v8
+; GCN-NEXT: v_cvt_f32_f16_e32 v33, v9
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v10
+; GCN-NEXT: v_cvt_f32_f16_e32 v23, v11
+; GCN-NEXT: v_cvt_f32_f16_e32 v20, v12
+; GCN-NEXT: v_cvt_f32_f16_e32 v24, v13
+; GCN-NEXT: v_cvt_f32_f16_e32 v21, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v25, v15
+; GCN-NEXT: v_cvt_f32_f16_e32 v22, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v17
+; GCN-NEXT: .LBB102_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v30
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v26
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v31
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v27
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v32
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v28
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v33
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v29
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, v23
+; GCN-NEXT: v_cvt_f16_f32_e32 v12, v19
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v24
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, v20
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v25
+; GCN-NEXT: v_cvt_f16_f32_e32 v16, v21
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v17, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_or_b32_e32 v3, v4, v3
+; GCN-NEXT: v_or_b32_e32 v4, v6, v5
+; GCN-NEXT: v_or_b32_e32 v5, v8, v7
+; GCN-NEXT: v_or_b32_e32 v6, v10, v9
+; GCN-NEXT: v_or_b32_e32 v7, v12, v11
+; GCN-NEXT: v_or_b32_e32 v8, v14, v13
+; GCN-NEXT: v_or_b32_e32 v9, v16, v15
+; GCN-NEXT: v_or_b32_e32 v10, v17, v0
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v16bf16_to_v16f16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v11, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v12, v11
+; VI-NEXT: v_mov_b32_e32 v13, v11
+; VI-NEXT: v_mov_b32_e32 v14, v11
+; VI-NEXT: v_mov_b32_e32 v15, v11
+; VI-NEXT: v_mov_b32_e32 v16, v11
+; VI-NEXT: v_mov_b32_e32 v17, v11
+; VI-NEXT: v_mov_b32_e32 v18, v11
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v18, v10
+; VI-NEXT: v_mov_b32_e32 v17, v9
+; VI-NEXT: v_mov_b32_e32 v16, v8
+; VI-NEXT: v_mov_b32_e32 v15, v7
+; VI-NEXT: v_mov_b32_e32 v14, v6
+; VI-NEXT: v_mov_b32_e32 v13, v5
+; VI-NEXT: v_mov_b32_e32 v12, v4
+; VI-NEXT: v_mov_b32_e32 v11, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v16bf16_to_v16f16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v11, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v12, v11
+; GFX9-NEXT: v_mov_b32_e32 v13, v11
+; GFX9-NEXT: v_mov_b32_e32 v14, v11
+; GFX9-NEXT: v_mov_b32_e32 v15, v11
+; GFX9-NEXT: v_mov_b32_e32 v16, v11
+; GFX9-NEXT: v_mov_b32_e32 v17, v11
+; GFX9-NEXT: v_mov_b32_e32 v18, v11
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v18, v10
+; GFX9-NEXT: v_mov_b32_e32 v17, v9
+; GFX9-NEXT: v_mov_b32_e32 v16, v8
+; GFX9-NEXT: v_mov_b32_e32 v15, v7
+; GFX9-NEXT: v_mov_b32_e32 v14, v6
+; GFX9-NEXT: v_mov_b32_e32 v13, v5
+; GFX9-NEXT: v_mov_b32_e32 v12, v4
+; GFX9-NEXT: v_mov_b32_e32 v11, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v16bf16_to_v16f16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v11, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v12, v11
+; GFX11-NEXT: v_mov_b32_e32 v13, v11
+; GFX11-NEXT: v_mov_b32_e32 v14, v11
+; GFX11-NEXT: v_mov_b32_e32 v15, v11
+; GFX11-NEXT: v_mov_b32_e32 v16, v11
+; GFX11-NEXT: v_mov_b32_e32 v17, v11
+; GFX11-NEXT: v_mov_b32_e32 v18, v11
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9
+; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7
+; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5
+; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1562,8 +12380,151 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v16bf16_to_v8i32:
+
define void @v_bitcast_v16bf16_to_v8i32(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v16bf16_to_v8i32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v19, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v20, v19
+; GCN-NEXT: v_mov_b32_e32 v21, v19
+; GCN-NEXT: v_mov_b32_e32 v22, v19
+; GCN-NEXT: v_mov_b32_e32 v23, v19
+; GCN-NEXT: v_mov_b32_e32 v24, v19
+; GCN-NEXT: v_mov_b32_e32 v25, v19
+; GCN-NEXT: v_mov_b32_e32 v26, v19
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB103_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_alignbit_b32 v19, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v20, v4, v5, 16
+; GCN-NEXT: v_alignbit_b32 v21, v6, v7, 16
+; GCN-NEXT: v_alignbit_b32 v22, v8, v9, 16
+; GCN-NEXT: v_alignbit_b32 v23, v10, v11, 16
+; GCN-NEXT: v_alignbit_b32 v24, v12, v13, 16
+; GCN-NEXT: v_alignbit_b32 v25, v14, v15, 16
+; GCN-NEXT: v_alignbit_b32 v26, v16, v17, 16
+; GCN-NEXT: .LBB103_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[23:26], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v16bf16_to_v8i32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v11, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v12, v11
+; VI-NEXT: v_mov_b32_e32 v13, v11
+; VI-NEXT: v_mov_b32_e32 v14, v11
+; VI-NEXT: v_mov_b32_e32 v15, v11
+; VI-NEXT: v_mov_b32_e32 v16, v11
+; VI-NEXT: v_mov_b32_e32 v17, v11
+; VI-NEXT: v_mov_b32_e32 v18, v11
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v18, v10
+; VI-NEXT: v_mov_b32_e32 v17, v9
+; VI-NEXT: v_mov_b32_e32 v16, v8
+; VI-NEXT: v_mov_b32_e32 v15, v7
+; VI-NEXT: v_mov_b32_e32 v14, v6
+; VI-NEXT: v_mov_b32_e32 v13, v5
+; VI-NEXT: v_mov_b32_e32 v12, v4
+; VI-NEXT: v_mov_b32_e32 v11, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v16bf16_to_v8i32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v11, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v12, v11
+; GFX9-NEXT: v_mov_b32_e32 v13, v11
+; GFX9-NEXT: v_mov_b32_e32 v14, v11
+; GFX9-NEXT: v_mov_b32_e32 v15, v11
+; GFX9-NEXT: v_mov_b32_e32 v16, v11
+; GFX9-NEXT: v_mov_b32_e32 v17, v11
+; GFX9-NEXT: v_mov_b32_e32 v18, v11
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v18, v10
+; GFX9-NEXT: v_mov_b32_e32 v17, v9
+; GFX9-NEXT: v_mov_b32_e32 v16, v8
+; GFX9-NEXT: v_mov_b32_e32 v15, v7
+; GFX9-NEXT: v_mov_b32_e32 v14, v6
+; GFX9-NEXT: v_mov_b32_e32 v13, v5
+; GFX9-NEXT: v_mov_b32_e32 v12, v4
+; GFX9-NEXT: v_mov_b32_e32 v11, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v16bf16_to_v8i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v11, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v12, v11
+; GFX11-NEXT: v_mov_b32_e32 v13, v11
+; GFX11-NEXT: v_mov_b32_e32 v14, v11
+; GFX11-NEXT: v_mov_b32_e32 v15, v11
+; GFX11-NEXT: v_mov_b32_e32 v16, v11
+; GFX11-NEXT: v_mov_b32_e32 v17, v11
+; GFX11-NEXT: v_mov_b32_e32 v18, v11
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9
+; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7
+; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5
+; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1578,8 +12539,151 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v16bf16_to_v8f32:
+
define void @v_bitcast_v16bf16_to_v8f32(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v16bf16_to_v8f32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v19, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v20, v19
+; GCN-NEXT: v_mov_b32_e32 v21, v19
+; GCN-NEXT: v_mov_b32_e32 v22, v19
+; GCN-NEXT: v_mov_b32_e32 v23, v19
+; GCN-NEXT: v_mov_b32_e32 v24, v19
+; GCN-NEXT: v_mov_b32_e32 v25, v19
+; GCN-NEXT: v_mov_b32_e32 v26, v19
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB104_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_alignbit_b32 v19, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v20, v4, v5, 16
+; GCN-NEXT: v_alignbit_b32 v21, v6, v7, 16
+; GCN-NEXT: v_alignbit_b32 v22, v8, v9, 16
+; GCN-NEXT: v_alignbit_b32 v23, v10, v11, 16
+; GCN-NEXT: v_alignbit_b32 v24, v12, v13, 16
+; GCN-NEXT: v_alignbit_b32 v25, v14, v15, 16
+; GCN-NEXT: v_alignbit_b32 v26, v16, v17, 16
+; GCN-NEXT: .LBB104_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[23:26], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v16bf16_to_v8f32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v11, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v12, v11
+; VI-NEXT: v_mov_b32_e32 v13, v11
+; VI-NEXT: v_mov_b32_e32 v14, v11
+; VI-NEXT: v_mov_b32_e32 v15, v11
+; VI-NEXT: v_mov_b32_e32 v16, v11
+; VI-NEXT: v_mov_b32_e32 v17, v11
+; VI-NEXT: v_mov_b32_e32 v18, v11
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v18, v10
+; VI-NEXT: v_mov_b32_e32 v17, v9
+; VI-NEXT: v_mov_b32_e32 v16, v8
+; VI-NEXT: v_mov_b32_e32 v15, v7
+; VI-NEXT: v_mov_b32_e32 v14, v6
+; VI-NEXT: v_mov_b32_e32 v13, v5
+; VI-NEXT: v_mov_b32_e32 v12, v4
+; VI-NEXT: v_mov_b32_e32 v11, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v16bf16_to_v8f32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v11, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v12, v11
+; GFX9-NEXT: v_mov_b32_e32 v13, v11
+; GFX9-NEXT: v_mov_b32_e32 v14, v11
+; GFX9-NEXT: v_mov_b32_e32 v15, v11
+; GFX9-NEXT: v_mov_b32_e32 v16, v11
+; GFX9-NEXT: v_mov_b32_e32 v17, v11
+; GFX9-NEXT: v_mov_b32_e32 v18, v11
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v18, v10
+; GFX9-NEXT: v_mov_b32_e32 v17, v9
+; GFX9-NEXT: v_mov_b32_e32 v16, v8
+; GFX9-NEXT: v_mov_b32_e32 v15, v7
+; GFX9-NEXT: v_mov_b32_e32 v14, v6
+; GFX9-NEXT: v_mov_b32_e32 v13, v5
+; GFX9-NEXT: v_mov_b32_e32 v12, v4
+; GFX9-NEXT: v_mov_b32_e32 v11, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v16bf16_to_v8f32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v11, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v12, v11
+; GFX11-NEXT: v_mov_b32_e32 v13, v11
+; GFX11-NEXT: v_mov_b32_e32 v14, v11
+; GFX11-NEXT: v_mov_b32_e32 v15, v11
+; GFX11-NEXT: v_mov_b32_e32 v16, v11
+; GFX11-NEXT: v_mov_b32_e32 v17, v11
+; GFX11-NEXT: v_mov_b32_e32 v18, v11
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9
+; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7
+; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5
+; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1594,8 +12698,151 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v16bf16_to_v4f64:
+
define void @v_bitcast_v16bf16_to_v4f64(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v16bf16_to_v4f64:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v19, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v20, v19
+; GCN-NEXT: v_mov_b32_e32 v21, v19
+; GCN-NEXT: v_mov_b32_e32 v22, v19
+; GCN-NEXT: v_mov_b32_e32 v23, v19
+; GCN-NEXT: v_mov_b32_e32 v24, v19
+; GCN-NEXT: v_mov_b32_e32 v25, v19
+; GCN-NEXT: v_mov_b32_e32 v26, v19
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB105_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_alignbit_b32 v19, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v20, v4, v5, 16
+; GCN-NEXT: v_alignbit_b32 v21, v6, v7, 16
+; GCN-NEXT: v_alignbit_b32 v22, v8, v9, 16
+; GCN-NEXT: v_alignbit_b32 v23, v10, v11, 16
+; GCN-NEXT: v_alignbit_b32 v24, v12, v13, 16
+; GCN-NEXT: v_alignbit_b32 v25, v14, v15, 16
+; GCN-NEXT: v_alignbit_b32 v26, v16, v17, 16
+; GCN-NEXT: .LBB105_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[23:26], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v16bf16_to_v4f64:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v11, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v12, v11
+; VI-NEXT: v_mov_b32_e32 v13, v11
+; VI-NEXT: v_mov_b32_e32 v14, v11
+; VI-NEXT: v_mov_b32_e32 v15, v11
+; VI-NEXT: v_mov_b32_e32 v16, v11
+; VI-NEXT: v_mov_b32_e32 v17, v11
+; VI-NEXT: v_mov_b32_e32 v18, v11
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v18, v10
+; VI-NEXT: v_mov_b32_e32 v17, v9
+; VI-NEXT: v_mov_b32_e32 v16, v8
+; VI-NEXT: v_mov_b32_e32 v15, v7
+; VI-NEXT: v_mov_b32_e32 v14, v6
+; VI-NEXT: v_mov_b32_e32 v13, v5
+; VI-NEXT: v_mov_b32_e32 v12, v4
+; VI-NEXT: v_mov_b32_e32 v11, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v16bf16_to_v4f64:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v11, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v12, v11
+; GFX9-NEXT: v_mov_b32_e32 v13, v11
+; GFX9-NEXT: v_mov_b32_e32 v14, v11
+; GFX9-NEXT: v_mov_b32_e32 v15, v11
+; GFX9-NEXT: v_mov_b32_e32 v16, v11
+; GFX9-NEXT: v_mov_b32_e32 v17, v11
+; GFX9-NEXT: v_mov_b32_e32 v18, v11
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v18, v10
+; GFX9-NEXT: v_mov_b32_e32 v17, v9
+; GFX9-NEXT: v_mov_b32_e32 v16, v8
+; GFX9-NEXT: v_mov_b32_e32 v15, v7
+; GFX9-NEXT: v_mov_b32_e32 v14, v6
+; GFX9-NEXT: v_mov_b32_e32 v13, v5
+; GFX9-NEXT: v_mov_b32_e32 v12, v4
+; GFX9-NEXT: v_mov_b32_e32 v11, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v16bf16_to_v4f64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v11, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v12, v11
+; GFX11-NEXT: v_mov_b32_e32 v13, v11
+; GFX11-NEXT: v_mov_b32_e32 v14, v11
+; GFX11-NEXT: v_mov_b32_e32 v15, v11
+; GFX11-NEXT: v_mov_b32_e32 v16, v11
+; GFX11-NEXT: v_mov_b32_e32 v17, v11
+; GFX11-NEXT: v_mov_b32_e32 v18, v11
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9
+; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7
+; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5
+; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1610,8 +12857,151 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v16bf16_to_v4i64:
+
define void @v_bitcast_v16bf16_to_v4i64(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v16bf16_to_v4i64:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v19, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v20, v19
+; GCN-NEXT: v_mov_b32_e32 v21, v19
+; GCN-NEXT: v_mov_b32_e32 v22, v19
+; GCN-NEXT: v_mov_b32_e32 v23, v19
+; GCN-NEXT: v_mov_b32_e32 v24, v19
+; GCN-NEXT: v_mov_b32_e32 v25, v19
+; GCN-NEXT: v_mov_b32_e32 v26, v19
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB106_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_alignbit_b32 v19, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v20, v4, v5, 16
+; GCN-NEXT: v_alignbit_b32 v21, v6, v7, 16
+; GCN-NEXT: v_alignbit_b32 v22, v8, v9, 16
+; GCN-NEXT: v_alignbit_b32 v23, v10, v11, 16
+; GCN-NEXT: v_alignbit_b32 v24, v12, v13, 16
+; GCN-NEXT: v_alignbit_b32 v25, v14, v15, 16
+; GCN-NEXT: v_alignbit_b32 v26, v16, v17, 16
+; GCN-NEXT: .LBB106_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[23:26], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v16bf16_to_v4i64:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v11, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v12, v11
+; VI-NEXT: v_mov_b32_e32 v13, v11
+; VI-NEXT: v_mov_b32_e32 v14, v11
+; VI-NEXT: v_mov_b32_e32 v15, v11
+; VI-NEXT: v_mov_b32_e32 v16, v11
+; VI-NEXT: v_mov_b32_e32 v17, v11
+; VI-NEXT: v_mov_b32_e32 v18, v11
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v18, v10
+; VI-NEXT: v_mov_b32_e32 v17, v9
+; VI-NEXT: v_mov_b32_e32 v16, v8
+; VI-NEXT: v_mov_b32_e32 v15, v7
+; VI-NEXT: v_mov_b32_e32 v14, v6
+; VI-NEXT: v_mov_b32_e32 v13, v5
+; VI-NEXT: v_mov_b32_e32 v12, v4
+; VI-NEXT: v_mov_b32_e32 v11, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v16bf16_to_v4i64:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v11, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v12, v11
+; GFX9-NEXT: v_mov_b32_e32 v13, v11
+; GFX9-NEXT: v_mov_b32_e32 v14, v11
+; GFX9-NEXT: v_mov_b32_e32 v15, v11
+; GFX9-NEXT: v_mov_b32_e32 v16, v11
+; GFX9-NEXT: v_mov_b32_e32 v17, v11
+; GFX9-NEXT: v_mov_b32_e32 v18, v11
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v18, v10
+; GFX9-NEXT: v_mov_b32_e32 v17, v9
+; GFX9-NEXT: v_mov_b32_e32 v16, v8
+; GFX9-NEXT: v_mov_b32_e32 v15, v7
+; GFX9-NEXT: v_mov_b32_e32 v14, v6
+; GFX9-NEXT: v_mov_b32_e32 v13, v5
+; GFX9-NEXT: v_mov_b32_e32 v12, v4
+; GFX9-NEXT: v_mov_b32_e32 v11, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v16bf16_to_v4i64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v11, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v12, v11
+; GFX11-NEXT: v_mov_b32_e32 v13, v11
+; GFX11-NEXT: v_mov_b32_e32 v14, v11
+; GFX11-NEXT: v_mov_b32_e32 v15, v11
+; GFX11-NEXT: v_mov_b32_e32 v16, v11
+; GFX11-NEXT: v_mov_b32_e32 v17, v11
+; GFX11-NEXT: v_mov_b32_e32 v18, v11
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9
+; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7
+; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5
+; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1626,8 +13016,151 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v16bf16_to_v32i8:
+
define void @v_bitcast_v16bf16_to_v32i8(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v16bf16_to_v32i8:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v19, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v20, v19
+; GCN-NEXT: v_mov_b32_e32 v21, v19
+; GCN-NEXT: v_mov_b32_e32 v22, v19
+; GCN-NEXT: v_mov_b32_e32 v23, v19
+; GCN-NEXT: v_mov_b32_e32 v24, v19
+; GCN-NEXT: v_mov_b32_e32 v25, v19
+; GCN-NEXT: v_mov_b32_e32 v26, v19
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB107_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_alignbit_b32 v19, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v20, v4, v5, 16
+; GCN-NEXT: v_alignbit_b32 v21, v6, v7, 16
+; GCN-NEXT: v_alignbit_b32 v22, v8, v9, 16
+; GCN-NEXT: v_alignbit_b32 v23, v10, v11, 16
+; GCN-NEXT: v_alignbit_b32 v24, v12, v13, 16
+; GCN-NEXT: v_alignbit_b32 v25, v14, v15, 16
+; GCN-NEXT: v_alignbit_b32 v26, v16, v17, 16
+; GCN-NEXT: .LBB107_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[23:26], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v16bf16_to_v32i8:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v11, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v12, v11
+; VI-NEXT: v_mov_b32_e32 v13, v11
+; VI-NEXT: v_mov_b32_e32 v14, v11
+; VI-NEXT: v_mov_b32_e32 v15, v11
+; VI-NEXT: v_mov_b32_e32 v16, v11
+; VI-NEXT: v_mov_b32_e32 v17, v11
+; VI-NEXT: v_mov_b32_e32 v18, v11
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v18, v10
+; VI-NEXT: v_mov_b32_e32 v17, v9
+; VI-NEXT: v_mov_b32_e32 v16, v8
+; VI-NEXT: v_mov_b32_e32 v15, v7
+; VI-NEXT: v_mov_b32_e32 v14, v6
+; VI-NEXT: v_mov_b32_e32 v13, v5
+; VI-NEXT: v_mov_b32_e32 v12, v4
+; VI-NEXT: v_mov_b32_e32 v11, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v16bf16_to_v32i8:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v11, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v12, v11
+; GFX9-NEXT: v_mov_b32_e32 v13, v11
+; GFX9-NEXT: v_mov_b32_e32 v14, v11
+; GFX9-NEXT: v_mov_b32_e32 v15, v11
+; GFX9-NEXT: v_mov_b32_e32 v16, v11
+; GFX9-NEXT: v_mov_b32_e32 v17, v11
+; GFX9-NEXT: v_mov_b32_e32 v18, v11
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v18, v10
+; GFX9-NEXT: v_mov_b32_e32 v17, v9
+; GFX9-NEXT: v_mov_b32_e32 v16, v8
+; GFX9-NEXT: v_mov_b32_e32 v15, v7
+; GFX9-NEXT: v_mov_b32_e32 v14, v6
+; GFX9-NEXT: v_mov_b32_e32 v13, v5
+; GFX9-NEXT: v_mov_b32_e32 v12, v4
+; GFX9-NEXT: v_mov_b32_e32 v11, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v16bf16_to_v32i8:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v11, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v12, v11
+; GFX11-NEXT: v_mov_b32_e32 v13, v11
+; GFX11-NEXT: v_mov_b32_e32 v14, v11
+; GFX11-NEXT: v_mov_b32_e32 v15, v11
+; GFX11-NEXT: v_mov_b32_e32 v16, v11
+; GFX11-NEXT: v_mov_b32_e32 v17, v11
+; GFX11-NEXT: v_mov_b32_e32 v18, v11
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9
+; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7
+; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5
+; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1642,8 +13175,175 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v8f32_to_v16bf16:
+
define void @v_bitcast_v8f32_to_v16bf16(i32 %cond, ptr addrspace(1) %out, <8 x float> %value) {
+; GCN-LABEL: v_bitcast_v8f32_to_v16bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v24, 0
+; GCN-NEXT: v_mov_b32_e32 v25, 0
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: v_mov_b32_e32 v23, 0
+; GCN-NEXT: v_mov_b32_e32 v20, 0
+; GCN-NEXT: v_mov_b32_e32 v21, 0
+; GCN-NEXT: v_mov_b32_e32 v18, 0
+; GCN-NEXT: v_mov_b32_e32 v19, 0
+; GCN-NEXT: v_mov_b32_e32 v16, 0
+; GCN-NEXT: v_mov_b32_e32 v17, 0
+; GCN-NEXT: v_mov_b32_e32 v14, 0
+; GCN-NEXT: v_mov_b32_e32 v15, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v13, 0
+; GCN-NEXT: v_mov_b32_e32 v11, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB108_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v9
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v8
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v5
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v4
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v3
+; GCN-NEXT: .LBB108_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v18
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
+; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
+; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16
+; GCN-NEXT: v_alignbit_b32 v7, v17, v16, 16
+; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16
+; GCN-NEXT: v_alignbit_b32 v9, v13, v12, 16
+; GCN-NEXT: v_alignbit_b32 v10, v0, v11, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v8f32_to_v16bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v11, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v12, v11
+; VI-NEXT: v_mov_b32_e32 v13, v11
+; VI-NEXT: v_mov_b32_e32 v14, v11
+; VI-NEXT: v_mov_b32_e32 v15, v11
+; VI-NEXT: v_mov_b32_e32 v16, v11
+; VI-NEXT: v_mov_b32_e32 v17, v11
+; VI-NEXT: v_mov_b32_e32 v18, v11
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v18, v10
+; VI-NEXT: v_mov_b32_e32 v17, v9
+; VI-NEXT: v_mov_b32_e32 v16, v8
+; VI-NEXT: v_mov_b32_e32 v15, v7
+; VI-NEXT: v_mov_b32_e32 v14, v6
+; VI-NEXT: v_mov_b32_e32 v13, v5
+; VI-NEXT: v_mov_b32_e32 v12, v4
+; VI-NEXT: v_mov_b32_e32 v11, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v8f32_to_v16bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v11, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v12, v11
+; GFX9-NEXT: v_mov_b32_e32 v13, v11
+; GFX9-NEXT: v_mov_b32_e32 v14, v11
+; GFX9-NEXT: v_mov_b32_e32 v15, v11
+; GFX9-NEXT: v_mov_b32_e32 v16, v11
+; GFX9-NEXT: v_mov_b32_e32 v17, v11
+; GFX9-NEXT: v_mov_b32_e32 v18, v11
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v18, v10
+; GFX9-NEXT: v_mov_b32_e32 v17, v9
+; GFX9-NEXT: v_mov_b32_e32 v16, v8
+; GFX9-NEXT: v_mov_b32_e32 v15, v7
+; GFX9-NEXT: v_mov_b32_e32 v14, v6
+; GFX9-NEXT: v_mov_b32_e32 v13, v5
+; GFX9-NEXT: v_mov_b32_e32 v12, v4
+; GFX9-NEXT: v_mov_b32_e32 v11, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v8f32_to_v16bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v11, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v12, v11
+; GFX11-NEXT: v_mov_b32_e32 v13, v11
+; GFX11-NEXT: v_mov_b32_e32 v14, v11
+; GFX11-NEXT: v_mov_b32_e32 v15, v11
+; GFX11-NEXT: v_mov_b32_e32 v16, v11
+; GFX11-NEXT: v_mov_b32_e32 v17, v11
+; GFX11-NEXT: v_mov_b32_e32 v18, v11
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9
+; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7
+; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5
+; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1658,8 +13358,175 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v8i32_to_v16bf16:
+
define void @v_bitcast_v8i32_to_v16bf16(i32 %cond, ptr addrspace(1) %out, <8 x i32> %value) {
+; GCN-LABEL: v_bitcast_v8i32_to_v16bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v24, 0
+; GCN-NEXT: v_mov_b32_e32 v25, 0
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: v_mov_b32_e32 v23, 0
+; GCN-NEXT: v_mov_b32_e32 v20, 0
+; GCN-NEXT: v_mov_b32_e32 v21, 0
+; GCN-NEXT: v_mov_b32_e32 v18, 0
+; GCN-NEXT: v_mov_b32_e32 v19, 0
+; GCN-NEXT: v_mov_b32_e32 v16, 0
+; GCN-NEXT: v_mov_b32_e32 v17, 0
+; GCN-NEXT: v_mov_b32_e32 v14, 0
+; GCN-NEXT: v_mov_b32_e32 v15, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v13, 0
+; GCN-NEXT: v_mov_b32_e32 v11, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB109_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v9
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v8
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v5
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v4
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v3
+; GCN-NEXT: .LBB109_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v18
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
+; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
+; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16
+; GCN-NEXT: v_alignbit_b32 v7, v17, v16, 16
+; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16
+; GCN-NEXT: v_alignbit_b32 v9, v13, v12, 16
+; GCN-NEXT: v_alignbit_b32 v10, v0, v11, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v8i32_to_v16bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v11, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v12, v11
+; VI-NEXT: v_mov_b32_e32 v13, v11
+; VI-NEXT: v_mov_b32_e32 v14, v11
+; VI-NEXT: v_mov_b32_e32 v15, v11
+; VI-NEXT: v_mov_b32_e32 v16, v11
+; VI-NEXT: v_mov_b32_e32 v17, v11
+; VI-NEXT: v_mov_b32_e32 v18, v11
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v18, v10
+; VI-NEXT: v_mov_b32_e32 v17, v9
+; VI-NEXT: v_mov_b32_e32 v16, v8
+; VI-NEXT: v_mov_b32_e32 v15, v7
+; VI-NEXT: v_mov_b32_e32 v14, v6
+; VI-NEXT: v_mov_b32_e32 v13, v5
+; VI-NEXT: v_mov_b32_e32 v12, v4
+; VI-NEXT: v_mov_b32_e32 v11, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v8i32_to_v16bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v11, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v12, v11
+; GFX9-NEXT: v_mov_b32_e32 v13, v11
+; GFX9-NEXT: v_mov_b32_e32 v14, v11
+; GFX9-NEXT: v_mov_b32_e32 v15, v11
+; GFX9-NEXT: v_mov_b32_e32 v16, v11
+; GFX9-NEXT: v_mov_b32_e32 v17, v11
+; GFX9-NEXT: v_mov_b32_e32 v18, v11
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v18, v10
+; GFX9-NEXT: v_mov_b32_e32 v17, v9
+; GFX9-NEXT: v_mov_b32_e32 v16, v8
+; GFX9-NEXT: v_mov_b32_e32 v15, v7
+; GFX9-NEXT: v_mov_b32_e32 v14, v6
+; GFX9-NEXT: v_mov_b32_e32 v13, v5
+; GFX9-NEXT: v_mov_b32_e32 v12, v4
+; GFX9-NEXT: v_mov_b32_e32 v11, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v8i32_to_v16bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v11, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v12, v11
+; GFX11-NEXT: v_mov_b32_e32 v13, v11
+; GFX11-NEXT: v_mov_b32_e32 v14, v11
+; GFX11-NEXT: v_mov_b32_e32 v15, v11
+; GFX11-NEXT: v_mov_b32_e32 v16, v11
+; GFX11-NEXT: v_mov_b32_e32 v17, v11
+; GFX11-NEXT: v_mov_b32_e32 v18, v11
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9
+; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7
+; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5
+; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1674,8 +13541,175 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v4i64_to_v16bf16:
+
define void @v_bitcast_v4i64_to_v16bf16(i32 %cond, ptr addrspace(1) %out, <4 x i64> %value) {
+; GCN-LABEL: v_bitcast_v4i64_to_v16bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v24, 0
+; GCN-NEXT: v_mov_b32_e32 v25, 0
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: v_mov_b32_e32 v23, 0
+; GCN-NEXT: v_mov_b32_e32 v20, 0
+; GCN-NEXT: v_mov_b32_e32 v21, 0
+; GCN-NEXT: v_mov_b32_e32 v18, 0
+; GCN-NEXT: v_mov_b32_e32 v19, 0
+; GCN-NEXT: v_mov_b32_e32 v16, 0
+; GCN-NEXT: v_mov_b32_e32 v17, 0
+; GCN-NEXT: v_mov_b32_e32 v14, 0
+; GCN-NEXT: v_mov_b32_e32 v15, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v13, 0
+; GCN-NEXT: v_mov_b32_e32 v11, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB110_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v9
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v8
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v5
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v4
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v3
+; GCN-NEXT: .LBB110_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v18
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
+; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
+; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16
+; GCN-NEXT: v_alignbit_b32 v7, v17, v16, 16
+; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16
+; GCN-NEXT: v_alignbit_b32 v9, v13, v12, 16
+; GCN-NEXT: v_alignbit_b32 v10, v0, v11, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v4i64_to_v16bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v11, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v12, v11
+; VI-NEXT: v_mov_b32_e32 v13, v11
+; VI-NEXT: v_mov_b32_e32 v14, v11
+; VI-NEXT: v_mov_b32_e32 v15, v11
+; VI-NEXT: v_mov_b32_e32 v16, v11
+; VI-NEXT: v_mov_b32_e32 v17, v11
+; VI-NEXT: v_mov_b32_e32 v18, v11
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v18, v10
+; VI-NEXT: v_mov_b32_e32 v17, v9
+; VI-NEXT: v_mov_b32_e32 v16, v8
+; VI-NEXT: v_mov_b32_e32 v15, v7
+; VI-NEXT: v_mov_b32_e32 v14, v6
+; VI-NEXT: v_mov_b32_e32 v13, v5
+; VI-NEXT: v_mov_b32_e32 v12, v4
+; VI-NEXT: v_mov_b32_e32 v11, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v4i64_to_v16bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v11, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v12, v11
+; GFX9-NEXT: v_mov_b32_e32 v13, v11
+; GFX9-NEXT: v_mov_b32_e32 v14, v11
+; GFX9-NEXT: v_mov_b32_e32 v15, v11
+; GFX9-NEXT: v_mov_b32_e32 v16, v11
+; GFX9-NEXT: v_mov_b32_e32 v17, v11
+; GFX9-NEXT: v_mov_b32_e32 v18, v11
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v18, v10
+; GFX9-NEXT: v_mov_b32_e32 v17, v9
+; GFX9-NEXT: v_mov_b32_e32 v16, v8
+; GFX9-NEXT: v_mov_b32_e32 v15, v7
+; GFX9-NEXT: v_mov_b32_e32 v14, v6
+; GFX9-NEXT: v_mov_b32_e32 v13, v5
+; GFX9-NEXT: v_mov_b32_e32 v12, v4
+; GFX9-NEXT: v_mov_b32_e32 v11, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v4i64_to_v16bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v11, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v12, v11
+; GFX11-NEXT: v_mov_b32_e32 v13, v11
+; GFX11-NEXT: v_mov_b32_e32 v14, v11
+; GFX11-NEXT: v_mov_b32_e32 v15, v11
+; GFX11-NEXT: v_mov_b32_e32 v16, v11
+; GFX11-NEXT: v_mov_b32_e32 v17, v11
+; GFX11-NEXT: v_mov_b32_e32 v18, v11
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9
+; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7
+; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5
+; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1690,8 +13724,175 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v4f64_to_v16bf16:
+
define void @v_bitcast_v4f64_to_v16bf16(i32 %cond, ptr addrspace(1) %out, <4 x double> %value) {
+; GCN-LABEL: v_bitcast_v4f64_to_v16bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v24, 0
+; GCN-NEXT: v_mov_b32_e32 v25, 0
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: v_mov_b32_e32 v23, 0
+; GCN-NEXT: v_mov_b32_e32 v20, 0
+; GCN-NEXT: v_mov_b32_e32 v21, 0
+; GCN-NEXT: v_mov_b32_e32 v18, 0
+; GCN-NEXT: v_mov_b32_e32 v19, 0
+; GCN-NEXT: v_mov_b32_e32 v16, 0
+; GCN-NEXT: v_mov_b32_e32 v17, 0
+; GCN-NEXT: v_mov_b32_e32 v14, 0
+; GCN-NEXT: v_mov_b32_e32 v15, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v13, 0
+; GCN-NEXT: v_mov_b32_e32 v11, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB111_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v9
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v8
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v5
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v4
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v3
+; GCN-NEXT: .LBB111_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v18
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
+; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
+; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16
+; GCN-NEXT: v_alignbit_b32 v7, v17, v16, 16
+; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16
+; GCN-NEXT: v_alignbit_b32 v9, v13, v12, 16
+; GCN-NEXT: v_alignbit_b32 v10, v0, v11, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v4f64_to_v16bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v11, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v12, v11
+; VI-NEXT: v_mov_b32_e32 v13, v11
+; VI-NEXT: v_mov_b32_e32 v14, v11
+; VI-NEXT: v_mov_b32_e32 v15, v11
+; VI-NEXT: v_mov_b32_e32 v16, v11
+; VI-NEXT: v_mov_b32_e32 v17, v11
+; VI-NEXT: v_mov_b32_e32 v18, v11
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v18, v10
+; VI-NEXT: v_mov_b32_e32 v17, v9
+; VI-NEXT: v_mov_b32_e32 v16, v8
+; VI-NEXT: v_mov_b32_e32 v15, v7
+; VI-NEXT: v_mov_b32_e32 v14, v6
+; VI-NEXT: v_mov_b32_e32 v13, v5
+; VI-NEXT: v_mov_b32_e32 v12, v4
+; VI-NEXT: v_mov_b32_e32 v11, v3
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v4f64_to_v16bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v11, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v12, v11
+; GFX9-NEXT: v_mov_b32_e32 v13, v11
+; GFX9-NEXT: v_mov_b32_e32 v14, v11
+; GFX9-NEXT: v_mov_b32_e32 v15, v11
+; GFX9-NEXT: v_mov_b32_e32 v16, v11
+; GFX9-NEXT: v_mov_b32_e32 v17, v11
+; GFX9-NEXT: v_mov_b32_e32 v18, v11
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v18, v10
+; GFX9-NEXT: v_mov_b32_e32 v17, v9
+; GFX9-NEXT: v_mov_b32_e32 v16, v8
+; GFX9-NEXT: v_mov_b32_e32 v15, v7
+; GFX9-NEXT: v_mov_b32_e32 v14, v6
+; GFX9-NEXT: v_mov_b32_e32 v13, v5
+; GFX9-NEXT: v_mov_b32_e32 v12, v4
+; GFX9-NEXT: v_mov_b32_e32 v11, v3
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v4f64_to_v16bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v11, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v12, v11
+; GFX11-NEXT: v_mov_b32_e32 v13, v11
+; GFX11-NEXT: v_mov_b32_e32 v14, v11
+; GFX11-NEXT: v_mov_b32_e32 v15, v11
+; GFX11-NEXT: v_mov_b32_e32 v16, v11
+; GFX11-NEXT: v_mov_b32_e32 v17, v11
+; GFX11-NEXT: v_mov_b32_e32 v18, v11
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9
+; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7
+; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5
+; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1706,8 +13907,373 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v32i8_to_v16bf16:
+
define void @v_bitcast_v32i8_to_v16bf16(i32 %cond, ptr addrspace(1) %out, <32 x i8> %value) {
+; GCN-LABEL: v_bitcast_v32i8_to_v16bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v50, 0
+; GCN-NEXT: v_mov_b32_e32 v54, 0
+; GCN-NEXT: v_mov_b32_e32 v51, 0
+; GCN-NEXT: v_mov_b32_e32 v55, 0
+; GCN-NEXT: v_mov_b32_e32 v52, 0
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_mov_b32_e32 v40, 0
+; GCN-NEXT: v_mov_b32_e32 v53, 0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v41, 0
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: v_mov_b32_e32 v39, 0
+; GCN-NEXT: v_mov_b32_e32 v34, 0
+; GCN-NEXT: v_mov_b32_e32 v48, 0
+; GCN-NEXT: v_mov_b32_e32 v35, 0
+; GCN-NEXT: v_mov_b32_e32 v49, 0
+; GCN-NEXT: v_mov_b32_e32 v36, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB112_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_and_b32_e32 v0, 0xff, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v4
+; GCN-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v6
+; GCN-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v8
+; GCN-NEXT: v_and_b32_e32 v8, 0xff, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v10
+; GCN-NEXT: v_and_b32_e32 v10, 0xff, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v12
+; GCN-NEXT: v_and_b32_e32 v12, 0xff, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v14
+; GCN-NEXT: v_and_b32_e32 v14, 0xff, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v16
+; GCN-NEXT: v_and_b32_e32 v16, 0xff, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v18
+; GCN-NEXT: v_and_b32_e32 v18, 0xff, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v20
+; GCN-NEXT: v_and_b32_e32 v20, 0xff, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 24, v22
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v24
+; GCN-NEXT: v_and_b32_e32 v24, 0xff, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v26
+; GCN-NEXT: v_and_b32_e32 v26, 0xff, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 24, v28
+; GCN-NEXT: v_and_b32_e32 v28, 0xff, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v30, 0xff, v38
+; GCN-NEXT: v_lshlrev_b32_e32 v33, 8, v37
+; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 24, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_or_b32_e32 v6, v6, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v12
+; GCN-NEXT: v_or_b32_e32 v12, v14, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v20
+; GCN-NEXT: v_or_b32_e32 v18, v22, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v26
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v28
+; GCN-NEXT: v_or_b32_e32 v24, v30, v33
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v32
+; GCN-NEXT: v_or_b32_e32 v50, v3, v0
+; GCN-NEXT: v_or_b32_e32 v54, v5, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v6
+; GCN-NEXT: v_or_b32_e32 v55, v9, v7
+; GCN-NEXT: v_or_b32_e32 v52, v11, v8
+; GCN-NEXT: v_or_b32_e32 v40, v13, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v12
+; GCN-NEXT: v_or_b32_e32 v41, v17, v14
+; GCN-NEXT: v_or_b32_e32 v33, v19, v15
+; GCN-NEXT: v_or_b32_e32 v39, v21, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v18
+; GCN-NEXT: v_or_b32_e32 v48, v25, v20
+; GCN-NEXT: v_or_b32_e32 v35, v27, v22
+; GCN-NEXT: v_or_b32_e32 v49, v29, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v24
+; GCN-NEXT: v_or_b32_e32 v0, v31, v26
+; GCN-NEXT: .LBB112_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v54
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v50
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v55
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v51
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v40
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v52
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v41
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v53
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v39
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v33
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v48
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v34
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v49
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v35
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v36
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
+; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
+; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16
+; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16
+; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16
+; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16
+; GCN-NEXT: v_alignbit_b32 v10, v0, v17, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v32i8_to_v16bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32
+; VI-NEXT: v_mov_b32_e32 v31, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v32, v31
+; VI-NEXT: v_mov_b32_e32 v33, v31
+; VI-NEXT: v_mov_b32_e32 v34, v31
+; VI-NEXT: v_mov_b32_e32 v35, v31
+; VI-NEXT: v_mov_b32_e32 v36, v31
+; VI-NEXT: v_mov_b32_e32 v37, v31
+; VI-NEXT: v_mov_b32_e32 v38, v31
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB112_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4
+; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v6
+; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v31, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v8
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v10
+; VI-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v32, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v12
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v14
+; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v33, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v16
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v18
+; VI-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v34, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v22
+; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v21, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v35, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v26
+; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v36, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v30
+; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v29, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v37, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v50
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v48
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v38, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: .LBB112_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[35:38]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[31:34]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v32i8_to_v16bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32
+; GFX9-NEXT: v_mov_b32_e32 v31, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v32, v31
+; GFX9-NEXT: v_mov_b32_e32 v33, v31
+; GFX9-NEXT: v_mov_b32_e32 v34, v31
+; GFX9-NEXT: v_mov_b32_e32 v35, v31
+; GFX9-NEXT: v_mov_b32_e32 v36, v31
+; GFX9-NEXT: v_mov_b32_e32 v37, v31
+; GFX9-NEXT: v_mov_b32_e32 v38, v31
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB112_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4
+; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v6
+; GFX9-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
+; GFX9-NEXT: v_perm_b32 v31, v3, v0, s6
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v10
+; GFX9-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v32, v3, v0, s6
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v14
+; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v33, v3, v0, s6
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v18
+; GFX9-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v34, v3, v0, s6
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v22
+; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v3, v21, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v35, v3, v0, s6
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v26
+; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v36, v3, v0, s6
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v30
+; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v3, v29, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v37, v3, v0, s6
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v50
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v48
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v38, v3, v0, s6
+; GFX9-NEXT: .LBB112_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v32i8_to_v16bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: scratch_load_u16 v39, off, s32 offset:12
+; GFX11-NEXT: scratch_load_u16 v48, off, s32 offset:8
+; GFX11-NEXT: scratch_load_u16 v49, off, s32 offset:4
+; GFX11-NEXT: scratch_load_u16 v50, off, s32
+; GFX11-NEXT: v_mov_b32_e32 v31, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v32, v31
+; GFX11-NEXT: v_mov_b32_e32 v33, v31
+; GFX11-NEXT: v_mov_b32_e32 v34, v31
+; GFX11-NEXT: v_mov_b32_e32 v35, v31
+; GFX11-NEXT: v_mov_b32_e32 v36, v31
+; GFX11-NEXT: v_mov_b32_e32 v37, v31
+; GFX11-NEXT: v_mov_b32_e32 v38, v31
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB112_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v3
+; GFX11-NEXT: v_lshlrev_b16 v3, 8, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-NEXT: v_lshlrev_b16 v5, 8, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v9
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX11-NEXT: v_lshlrev_b16 v3, 8, v8
+; GFX11-NEXT: v_lshlrev_b16 v8, 8, v10
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v11
+; GFX11-NEXT: v_lshlrev_b16 v10, 8, v12
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v13
+; GFX11-NEXT: v_lshlrev_b16 v12, 8, v14
+; GFX11-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX11-NEXT: v_or_b32_e32 v3, v6, v3
+; GFX11-NEXT: v_or_b32_e32 v5, v7, v8
+; GFX11-NEXT: v_or_b32_e32 v6, v9, v10
+; GFX11-NEXT: v_or_b32_e32 v7, v11, v12
+; GFX11-NEXT: v_perm_b32 v31, v4, v0, 0x5040100
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v15
+; GFX11-NEXT: v_perm_b32 v32, v5, v3, 0x5040100
+; GFX11-NEXT: v_lshlrev_b16 v3, 8, v16
+; GFX11-NEXT: v_perm_b32 v33, v7, v6, 0x5040100
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v17
+; GFX11-NEXT: v_lshlrev_b16 v5, 8, v18
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v19
+; GFX11-NEXT: v_lshlrev_b16 v7, 8, v20
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v21
+; GFX11-NEXT: v_lshlrev_b16 v9, 8, v22
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v23
+; GFX11-NEXT: v_lshlrev_b16 v11, 8, v24
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
+; GFX11-NEXT: v_or_b32_e32 v4, v6, v7
+; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
+; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v25
+; GFX11-NEXT: v_lshlrev_b16 v8, 8, v26
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v27
+; GFX11-NEXT: v_lshlrev_b16 v10, 8, v28
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v29
+; GFX11-NEXT: v_lshlrev_b16 v12, 8, v30
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v50
+; GFX11-NEXT: v_lshlrev_b16 v14, 8, v49
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v48
+; GFX11-NEXT: v_lshlrev_b16 v16, 8, v39
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
+; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
+; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
+; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
+; GFX11-NEXT: v_perm_b32 v34, v3, v0, 0x5040100
+; GFX11-NEXT: v_perm_b32 v35, v5, v4, 0x5040100
+; GFX11-NEXT: v_perm_b32 v36, v7, v6, 0x5040100
+; GFX11-NEXT: v_perm_b32 v37, v9, v8, 0x5040100
+; GFX11-NEXT: v_perm_b32 v38, v11, v10, 0x5040100
+; GFX11-NEXT: .LBB112_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1722,8 +14288,266 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v32bf16_to_v8i64:
+
define void @v_bitcast_v32bf16_to_v8i64(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v32bf16_to_v8i64:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(2)
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v31, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v32, v31
+; GCN-NEXT: v_mov_b32_e32 v33, v31
+; GCN-NEXT: v_mov_b32_e32 v34, v31
+; GCN-NEXT: v_mov_b32_e32 v35, v31
+; GCN-NEXT: v_mov_b32_e32 v36, v31
+; GCN-NEXT: v_mov_b32_e32 v37, v31
+; GCN-NEXT: v_mov_b32_e32 v38, v31
+; GCN-NEXT: v_mov_b32_e32 v48, v31
+; GCN-NEXT: v_mov_b32_e32 v49, v31
+; GCN-NEXT: v_mov_b32_e32 v50, v31
+; GCN-NEXT: v_mov_b32_e32 v51, v31
+; GCN-NEXT: v_mov_b32_e32 v52, v31
+; GCN-NEXT: v_mov_b32_e32 v53, v31
+; GCN-NEXT: v_mov_b32_e32 v54, v31
+; GCN-NEXT: v_mov_b32_e32 v55, v31
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB113_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v30
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v42
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v41
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v40
+; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v31
+; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16
+; GCN-NEXT: v_alignbit_b32 v33, v6, v7, 16
+; GCN-NEXT: v_alignbit_b32 v34, v8, v9, 16
+; GCN-NEXT: v_alignbit_b32 v35, v10, v11, 16
+; GCN-NEXT: v_alignbit_b32 v36, v12, v13, 16
+; GCN-NEXT: v_alignbit_b32 v37, v14, v15, 16
+; GCN-NEXT: v_alignbit_b32 v38, v16, v17, 16
+; GCN-NEXT: v_alignbit_b32 v48, v18, v19, 16
+; GCN-NEXT: v_alignbit_b32 v49, v20, v21, 16
+; GCN-NEXT: v_alignbit_b32 v50, v22, v23, 16
+; GCN-NEXT: v_alignbit_b32 v51, v24, v25, 16
+; GCN-NEXT: v_alignbit_b32 v52, v26, v27, 16
+; GCN-NEXT: v_alignbit_b32 v53, v28, v29, 16
+; GCN-NEXT: v_alignbit_b32 v54, v30, v54, 16
+; GCN-NEXT: v_alignbit_b32 v55, v55, v39, 16
+; GCN-NEXT: .LBB113_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v32bf16_to_v8i64:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v19, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v20, v19
+; VI-NEXT: v_mov_b32_e32 v21, v19
+; VI-NEXT: v_mov_b32_e32 v22, v19
+; VI-NEXT: v_mov_b32_e32 v23, v19
+; VI-NEXT: v_mov_b32_e32 v24, v19
+; VI-NEXT: v_mov_b32_e32 v25, v19
+; VI-NEXT: v_mov_b32_e32 v26, v19
+; VI-NEXT: v_mov_b32_e32 v27, v19
+; VI-NEXT: v_mov_b32_e32 v28, v19
+; VI-NEXT: v_mov_b32_e32 v29, v19
+; VI-NEXT: v_mov_b32_e32 v30, v19
+; VI-NEXT: v_mov_b32_e32 v31, v19
+; VI-NEXT: v_mov_b32_e32 v32, v19
+; VI-NEXT: v_mov_b32_e32 v33, v19
+; VI-NEXT: v_mov_b32_e32 v34, v19
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB113_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v34, v18
+; VI-NEXT: v_mov_b32_e32 v33, v17
+; VI-NEXT: v_mov_b32_e32 v32, v16
+; VI-NEXT: v_mov_b32_e32 v31, v15
+; VI-NEXT: v_mov_b32_e32 v30, v14
+; VI-NEXT: v_mov_b32_e32 v29, v13
+; VI-NEXT: v_mov_b32_e32 v28, v12
+; VI-NEXT: v_mov_b32_e32 v27, v11
+; VI-NEXT: v_mov_b32_e32 v26, v10
+; VI-NEXT: v_mov_b32_e32 v25, v9
+; VI-NEXT: v_mov_b32_e32 v24, v8
+; VI-NEXT: v_mov_b32_e32 v23, v7
+; VI-NEXT: v_mov_b32_e32 v22, v6
+; VI-NEXT: v_mov_b32_e32 v21, v5
+; VI-NEXT: v_mov_b32_e32 v20, v4
+; VI-NEXT: v_mov_b32_e32 v19, v3
+; VI-NEXT: .LBB113_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v32bf16_to_v8i64:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v19, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v20, v19
+; GFX9-NEXT: v_mov_b32_e32 v21, v19
+; GFX9-NEXT: v_mov_b32_e32 v22, v19
+; GFX9-NEXT: v_mov_b32_e32 v23, v19
+; GFX9-NEXT: v_mov_b32_e32 v24, v19
+; GFX9-NEXT: v_mov_b32_e32 v25, v19
+; GFX9-NEXT: v_mov_b32_e32 v26, v19
+; GFX9-NEXT: v_mov_b32_e32 v27, v19
+; GFX9-NEXT: v_mov_b32_e32 v28, v19
+; GFX9-NEXT: v_mov_b32_e32 v29, v19
+; GFX9-NEXT: v_mov_b32_e32 v30, v19
+; GFX9-NEXT: v_mov_b32_e32 v31, v19
+; GFX9-NEXT: v_mov_b32_e32 v32, v19
+; GFX9-NEXT: v_mov_b32_e32 v33, v19
+; GFX9-NEXT: v_mov_b32_e32 v34, v19
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB113_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v34, v18
+; GFX9-NEXT: v_mov_b32_e32 v33, v17
+; GFX9-NEXT: v_mov_b32_e32 v32, v16
+; GFX9-NEXT: v_mov_b32_e32 v31, v15
+; GFX9-NEXT: v_mov_b32_e32 v30, v14
+; GFX9-NEXT: v_mov_b32_e32 v29, v13
+; GFX9-NEXT: v_mov_b32_e32 v28, v12
+; GFX9-NEXT: v_mov_b32_e32 v27, v11
+; GFX9-NEXT: v_mov_b32_e32 v26, v10
+; GFX9-NEXT: v_mov_b32_e32 v25, v9
+; GFX9-NEXT: v_mov_b32_e32 v24, v8
+; GFX9-NEXT: v_mov_b32_e32 v23, v7
+; GFX9-NEXT: v_mov_b32_e32 v22, v6
+; GFX9-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-NEXT: v_mov_b32_e32 v19, v3
+; GFX9-NEXT: .LBB113_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v32bf16_to_v8i64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v19, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v20, v19
+; GFX11-NEXT: v_mov_b32_e32 v21, v19
+; GFX11-NEXT: v_mov_b32_e32 v22, v19
+; GFX11-NEXT: v_mov_b32_e32 v23, v19
+; GFX11-NEXT: v_mov_b32_e32 v24, v19
+; GFX11-NEXT: v_mov_b32_e32 v25, v19
+; GFX11-NEXT: v_mov_b32_e32 v26, v19
+; GFX11-NEXT: v_mov_b32_e32 v27, v19
+; GFX11-NEXT: v_mov_b32_e32 v28, v19
+; GFX11-NEXT: v_mov_b32_e32 v29, v19
+; GFX11-NEXT: v_mov_b32_e32 v30, v19
+; GFX11-NEXT: v_mov_b32_e32 v31, v19
+; GFX11-NEXT: v_mov_b32_e32 v32, v19
+; GFX11-NEXT: v_mov_b32_e32 v33, v19
+; GFX11-NEXT: v_mov_b32_e32 v34, v19
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB113_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
+; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
+; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
+; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
+; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
+; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
+; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
+; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
+; GFX11-NEXT: .LBB113_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
+; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
+; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1738,8 +14562,266 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v32bf16_to_v8f64:
+
define void @v_bitcast_v32bf16_to_v8f64(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v32bf16_to_v8f64:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(2)
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v31, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v32, v31
+; GCN-NEXT: v_mov_b32_e32 v33, v31
+; GCN-NEXT: v_mov_b32_e32 v34, v31
+; GCN-NEXT: v_mov_b32_e32 v35, v31
+; GCN-NEXT: v_mov_b32_e32 v36, v31
+; GCN-NEXT: v_mov_b32_e32 v37, v31
+; GCN-NEXT: v_mov_b32_e32 v38, v31
+; GCN-NEXT: v_mov_b32_e32 v48, v31
+; GCN-NEXT: v_mov_b32_e32 v49, v31
+; GCN-NEXT: v_mov_b32_e32 v50, v31
+; GCN-NEXT: v_mov_b32_e32 v51, v31
+; GCN-NEXT: v_mov_b32_e32 v52, v31
+; GCN-NEXT: v_mov_b32_e32 v53, v31
+; GCN-NEXT: v_mov_b32_e32 v54, v31
+; GCN-NEXT: v_mov_b32_e32 v55, v31
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB114_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v30
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v42
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v41
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v40
+; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v31
+; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16
+; GCN-NEXT: v_alignbit_b32 v33, v6, v7, 16
+; GCN-NEXT: v_alignbit_b32 v34, v8, v9, 16
+; GCN-NEXT: v_alignbit_b32 v35, v10, v11, 16
+; GCN-NEXT: v_alignbit_b32 v36, v12, v13, 16
+; GCN-NEXT: v_alignbit_b32 v37, v14, v15, 16
+; GCN-NEXT: v_alignbit_b32 v38, v16, v17, 16
+; GCN-NEXT: v_alignbit_b32 v48, v18, v19, 16
+; GCN-NEXT: v_alignbit_b32 v49, v20, v21, 16
+; GCN-NEXT: v_alignbit_b32 v50, v22, v23, 16
+; GCN-NEXT: v_alignbit_b32 v51, v24, v25, 16
+; GCN-NEXT: v_alignbit_b32 v52, v26, v27, 16
+; GCN-NEXT: v_alignbit_b32 v53, v28, v29, 16
+; GCN-NEXT: v_alignbit_b32 v54, v30, v54, 16
+; GCN-NEXT: v_alignbit_b32 v55, v55, v39, 16
+; GCN-NEXT: .LBB114_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v32bf16_to_v8f64:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v19, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v20, v19
+; VI-NEXT: v_mov_b32_e32 v21, v19
+; VI-NEXT: v_mov_b32_e32 v22, v19
+; VI-NEXT: v_mov_b32_e32 v23, v19
+; VI-NEXT: v_mov_b32_e32 v24, v19
+; VI-NEXT: v_mov_b32_e32 v25, v19
+; VI-NEXT: v_mov_b32_e32 v26, v19
+; VI-NEXT: v_mov_b32_e32 v27, v19
+; VI-NEXT: v_mov_b32_e32 v28, v19
+; VI-NEXT: v_mov_b32_e32 v29, v19
+; VI-NEXT: v_mov_b32_e32 v30, v19
+; VI-NEXT: v_mov_b32_e32 v31, v19
+; VI-NEXT: v_mov_b32_e32 v32, v19
+; VI-NEXT: v_mov_b32_e32 v33, v19
+; VI-NEXT: v_mov_b32_e32 v34, v19
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB114_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v34, v18
+; VI-NEXT: v_mov_b32_e32 v33, v17
+; VI-NEXT: v_mov_b32_e32 v32, v16
+; VI-NEXT: v_mov_b32_e32 v31, v15
+; VI-NEXT: v_mov_b32_e32 v30, v14
+; VI-NEXT: v_mov_b32_e32 v29, v13
+; VI-NEXT: v_mov_b32_e32 v28, v12
+; VI-NEXT: v_mov_b32_e32 v27, v11
+; VI-NEXT: v_mov_b32_e32 v26, v10
+; VI-NEXT: v_mov_b32_e32 v25, v9
+; VI-NEXT: v_mov_b32_e32 v24, v8
+; VI-NEXT: v_mov_b32_e32 v23, v7
+; VI-NEXT: v_mov_b32_e32 v22, v6
+; VI-NEXT: v_mov_b32_e32 v21, v5
+; VI-NEXT: v_mov_b32_e32 v20, v4
+; VI-NEXT: v_mov_b32_e32 v19, v3
+; VI-NEXT: .LBB114_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v32bf16_to_v8f64:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v19, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v20, v19
+; GFX9-NEXT: v_mov_b32_e32 v21, v19
+; GFX9-NEXT: v_mov_b32_e32 v22, v19
+; GFX9-NEXT: v_mov_b32_e32 v23, v19
+; GFX9-NEXT: v_mov_b32_e32 v24, v19
+; GFX9-NEXT: v_mov_b32_e32 v25, v19
+; GFX9-NEXT: v_mov_b32_e32 v26, v19
+; GFX9-NEXT: v_mov_b32_e32 v27, v19
+; GFX9-NEXT: v_mov_b32_e32 v28, v19
+; GFX9-NEXT: v_mov_b32_e32 v29, v19
+; GFX9-NEXT: v_mov_b32_e32 v30, v19
+; GFX9-NEXT: v_mov_b32_e32 v31, v19
+; GFX9-NEXT: v_mov_b32_e32 v32, v19
+; GFX9-NEXT: v_mov_b32_e32 v33, v19
+; GFX9-NEXT: v_mov_b32_e32 v34, v19
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB114_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v34, v18
+; GFX9-NEXT: v_mov_b32_e32 v33, v17
+; GFX9-NEXT: v_mov_b32_e32 v32, v16
+; GFX9-NEXT: v_mov_b32_e32 v31, v15
+; GFX9-NEXT: v_mov_b32_e32 v30, v14
+; GFX9-NEXT: v_mov_b32_e32 v29, v13
+; GFX9-NEXT: v_mov_b32_e32 v28, v12
+; GFX9-NEXT: v_mov_b32_e32 v27, v11
+; GFX9-NEXT: v_mov_b32_e32 v26, v10
+; GFX9-NEXT: v_mov_b32_e32 v25, v9
+; GFX9-NEXT: v_mov_b32_e32 v24, v8
+; GFX9-NEXT: v_mov_b32_e32 v23, v7
+; GFX9-NEXT: v_mov_b32_e32 v22, v6
+; GFX9-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-NEXT: v_mov_b32_e32 v19, v3
+; GFX9-NEXT: .LBB114_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v32bf16_to_v8f64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v19, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v20, v19
+; GFX11-NEXT: v_mov_b32_e32 v21, v19
+; GFX11-NEXT: v_mov_b32_e32 v22, v19
+; GFX11-NEXT: v_mov_b32_e32 v23, v19
+; GFX11-NEXT: v_mov_b32_e32 v24, v19
+; GFX11-NEXT: v_mov_b32_e32 v25, v19
+; GFX11-NEXT: v_mov_b32_e32 v26, v19
+; GFX11-NEXT: v_mov_b32_e32 v27, v19
+; GFX11-NEXT: v_mov_b32_e32 v28, v19
+; GFX11-NEXT: v_mov_b32_e32 v29, v19
+; GFX11-NEXT: v_mov_b32_e32 v30, v19
+; GFX11-NEXT: v_mov_b32_e32 v31, v19
+; GFX11-NEXT: v_mov_b32_e32 v32, v19
+; GFX11-NEXT: v_mov_b32_e32 v33, v19
+; GFX11-NEXT: v_mov_b32_e32 v34, v19
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB114_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
+; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
+; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
+; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
+; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
+; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
+; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
+; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
+; GFX11-NEXT: .LBB114_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
+; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
+; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1754,8 +14836,266 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v32bf16_to_v16i32:
+
define void @v_bitcast_v32bf16_to_v16i32(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v32bf16_to_v16i32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(2)
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v31, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v32, v31
+; GCN-NEXT: v_mov_b32_e32 v33, v31
+; GCN-NEXT: v_mov_b32_e32 v34, v31
+; GCN-NEXT: v_mov_b32_e32 v35, v31
+; GCN-NEXT: v_mov_b32_e32 v36, v31
+; GCN-NEXT: v_mov_b32_e32 v37, v31
+; GCN-NEXT: v_mov_b32_e32 v38, v31
+; GCN-NEXT: v_mov_b32_e32 v48, v31
+; GCN-NEXT: v_mov_b32_e32 v49, v31
+; GCN-NEXT: v_mov_b32_e32 v50, v31
+; GCN-NEXT: v_mov_b32_e32 v51, v31
+; GCN-NEXT: v_mov_b32_e32 v52, v31
+; GCN-NEXT: v_mov_b32_e32 v53, v31
+; GCN-NEXT: v_mov_b32_e32 v54, v31
+; GCN-NEXT: v_mov_b32_e32 v55, v31
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB115_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v30
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v42
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v41
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v40
+; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v31
+; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16
+; GCN-NEXT: v_alignbit_b32 v33, v6, v7, 16
+; GCN-NEXT: v_alignbit_b32 v34, v8, v9, 16
+; GCN-NEXT: v_alignbit_b32 v35, v10, v11, 16
+; GCN-NEXT: v_alignbit_b32 v36, v12, v13, 16
+; GCN-NEXT: v_alignbit_b32 v37, v14, v15, 16
+; GCN-NEXT: v_alignbit_b32 v38, v16, v17, 16
+; GCN-NEXT: v_alignbit_b32 v48, v18, v19, 16
+; GCN-NEXT: v_alignbit_b32 v49, v20, v21, 16
+; GCN-NEXT: v_alignbit_b32 v50, v22, v23, 16
+; GCN-NEXT: v_alignbit_b32 v51, v24, v25, 16
+; GCN-NEXT: v_alignbit_b32 v52, v26, v27, 16
+; GCN-NEXT: v_alignbit_b32 v53, v28, v29, 16
+; GCN-NEXT: v_alignbit_b32 v54, v30, v54, 16
+; GCN-NEXT: v_alignbit_b32 v55, v55, v39, 16
+; GCN-NEXT: .LBB115_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v32bf16_to_v16i32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v19, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v20, v19
+; VI-NEXT: v_mov_b32_e32 v21, v19
+; VI-NEXT: v_mov_b32_e32 v22, v19
+; VI-NEXT: v_mov_b32_e32 v23, v19
+; VI-NEXT: v_mov_b32_e32 v24, v19
+; VI-NEXT: v_mov_b32_e32 v25, v19
+; VI-NEXT: v_mov_b32_e32 v26, v19
+; VI-NEXT: v_mov_b32_e32 v27, v19
+; VI-NEXT: v_mov_b32_e32 v28, v19
+; VI-NEXT: v_mov_b32_e32 v29, v19
+; VI-NEXT: v_mov_b32_e32 v30, v19
+; VI-NEXT: v_mov_b32_e32 v31, v19
+; VI-NEXT: v_mov_b32_e32 v32, v19
+; VI-NEXT: v_mov_b32_e32 v33, v19
+; VI-NEXT: v_mov_b32_e32 v34, v19
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB115_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v34, v18
+; VI-NEXT: v_mov_b32_e32 v33, v17
+; VI-NEXT: v_mov_b32_e32 v32, v16
+; VI-NEXT: v_mov_b32_e32 v31, v15
+; VI-NEXT: v_mov_b32_e32 v30, v14
+; VI-NEXT: v_mov_b32_e32 v29, v13
+; VI-NEXT: v_mov_b32_e32 v28, v12
+; VI-NEXT: v_mov_b32_e32 v27, v11
+; VI-NEXT: v_mov_b32_e32 v26, v10
+; VI-NEXT: v_mov_b32_e32 v25, v9
+; VI-NEXT: v_mov_b32_e32 v24, v8
+; VI-NEXT: v_mov_b32_e32 v23, v7
+; VI-NEXT: v_mov_b32_e32 v22, v6
+; VI-NEXT: v_mov_b32_e32 v21, v5
+; VI-NEXT: v_mov_b32_e32 v20, v4
+; VI-NEXT: v_mov_b32_e32 v19, v3
+; VI-NEXT: .LBB115_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v32bf16_to_v16i32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v19, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v20, v19
+; GFX9-NEXT: v_mov_b32_e32 v21, v19
+; GFX9-NEXT: v_mov_b32_e32 v22, v19
+; GFX9-NEXT: v_mov_b32_e32 v23, v19
+; GFX9-NEXT: v_mov_b32_e32 v24, v19
+; GFX9-NEXT: v_mov_b32_e32 v25, v19
+; GFX9-NEXT: v_mov_b32_e32 v26, v19
+; GFX9-NEXT: v_mov_b32_e32 v27, v19
+; GFX9-NEXT: v_mov_b32_e32 v28, v19
+; GFX9-NEXT: v_mov_b32_e32 v29, v19
+; GFX9-NEXT: v_mov_b32_e32 v30, v19
+; GFX9-NEXT: v_mov_b32_e32 v31, v19
+; GFX9-NEXT: v_mov_b32_e32 v32, v19
+; GFX9-NEXT: v_mov_b32_e32 v33, v19
+; GFX9-NEXT: v_mov_b32_e32 v34, v19
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB115_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v34, v18
+; GFX9-NEXT: v_mov_b32_e32 v33, v17
+; GFX9-NEXT: v_mov_b32_e32 v32, v16
+; GFX9-NEXT: v_mov_b32_e32 v31, v15
+; GFX9-NEXT: v_mov_b32_e32 v30, v14
+; GFX9-NEXT: v_mov_b32_e32 v29, v13
+; GFX9-NEXT: v_mov_b32_e32 v28, v12
+; GFX9-NEXT: v_mov_b32_e32 v27, v11
+; GFX9-NEXT: v_mov_b32_e32 v26, v10
+; GFX9-NEXT: v_mov_b32_e32 v25, v9
+; GFX9-NEXT: v_mov_b32_e32 v24, v8
+; GFX9-NEXT: v_mov_b32_e32 v23, v7
+; GFX9-NEXT: v_mov_b32_e32 v22, v6
+; GFX9-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-NEXT: v_mov_b32_e32 v19, v3
+; GFX9-NEXT: .LBB115_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v32bf16_to_v16i32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v19, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v20, v19
+; GFX11-NEXT: v_mov_b32_e32 v21, v19
+; GFX11-NEXT: v_mov_b32_e32 v22, v19
+; GFX11-NEXT: v_mov_b32_e32 v23, v19
+; GFX11-NEXT: v_mov_b32_e32 v24, v19
+; GFX11-NEXT: v_mov_b32_e32 v25, v19
+; GFX11-NEXT: v_mov_b32_e32 v26, v19
+; GFX11-NEXT: v_mov_b32_e32 v27, v19
+; GFX11-NEXT: v_mov_b32_e32 v28, v19
+; GFX11-NEXT: v_mov_b32_e32 v29, v19
+; GFX11-NEXT: v_mov_b32_e32 v30, v19
+; GFX11-NEXT: v_mov_b32_e32 v31, v19
+; GFX11-NEXT: v_mov_b32_e32 v32, v19
+; GFX11-NEXT: v_mov_b32_e32 v33, v19
+; GFX11-NEXT: v_mov_b32_e32 v34, v19
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB115_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
+; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
+; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
+; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
+; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
+; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
+; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
+; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
+; GFX11-NEXT: .LBB115_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
+; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
+; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1770,8 +15110,266 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v32bf16_to_v16f32:
+
define void @v_bitcast_v32bf16_to_v16f32(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v32bf16_to_v16f32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(2)
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v31, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v32, v31
+; GCN-NEXT: v_mov_b32_e32 v33, v31
+; GCN-NEXT: v_mov_b32_e32 v34, v31
+; GCN-NEXT: v_mov_b32_e32 v35, v31
+; GCN-NEXT: v_mov_b32_e32 v36, v31
+; GCN-NEXT: v_mov_b32_e32 v37, v31
+; GCN-NEXT: v_mov_b32_e32 v38, v31
+; GCN-NEXT: v_mov_b32_e32 v48, v31
+; GCN-NEXT: v_mov_b32_e32 v49, v31
+; GCN-NEXT: v_mov_b32_e32 v50, v31
+; GCN-NEXT: v_mov_b32_e32 v51, v31
+; GCN-NEXT: v_mov_b32_e32 v52, v31
+; GCN-NEXT: v_mov_b32_e32 v53, v31
+; GCN-NEXT: v_mov_b32_e32 v54, v31
+; GCN-NEXT: v_mov_b32_e32 v55, v31
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB116_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v30
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v42
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v41
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v40
+; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v31
+; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16
+; GCN-NEXT: v_alignbit_b32 v33, v6, v7, 16
+; GCN-NEXT: v_alignbit_b32 v34, v8, v9, 16
+; GCN-NEXT: v_alignbit_b32 v35, v10, v11, 16
+; GCN-NEXT: v_alignbit_b32 v36, v12, v13, 16
+; GCN-NEXT: v_alignbit_b32 v37, v14, v15, 16
+; GCN-NEXT: v_alignbit_b32 v38, v16, v17, 16
+; GCN-NEXT: v_alignbit_b32 v48, v18, v19, 16
+; GCN-NEXT: v_alignbit_b32 v49, v20, v21, 16
+; GCN-NEXT: v_alignbit_b32 v50, v22, v23, 16
+; GCN-NEXT: v_alignbit_b32 v51, v24, v25, 16
+; GCN-NEXT: v_alignbit_b32 v52, v26, v27, 16
+; GCN-NEXT: v_alignbit_b32 v53, v28, v29, 16
+; GCN-NEXT: v_alignbit_b32 v54, v30, v54, 16
+; GCN-NEXT: v_alignbit_b32 v55, v55, v39, 16
+; GCN-NEXT: .LBB116_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v32bf16_to_v16f32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v19, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v20, v19
+; VI-NEXT: v_mov_b32_e32 v21, v19
+; VI-NEXT: v_mov_b32_e32 v22, v19
+; VI-NEXT: v_mov_b32_e32 v23, v19
+; VI-NEXT: v_mov_b32_e32 v24, v19
+; VI-NEXT: v_mov_b32_e32 v25, v19
+; VI-NEXT: v_mov_b32_e32 v26, v19
+; VI-NEXT: v_mov_b32_e32 v27, v19
+; VI-NEXT: v_mov_b32_e32 v28, v19
+; VI-NEXT: v_mov_b32_e32 v29, v19
+; VI-NEXT: v_mov_b32_e32 v30, v19
+; VI-NEXT: v_mov_b32_e32 v31, v19
+; VI-NEXT: v_mov_b32_e32 v32, v19
+; VI-NEXT: v_mov_b32_e32 v33, v19
+; VI-NEXT: v_mov_b32_e32 v34, v19
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB116_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v34, v18
+; VI-NEXT: v_mov_b32_e32 v33, v17
+; VI-NEXT: v_mov_b32_e32 v32, v16
+; VI-NEXT: v_mov_b32_e32 v31, v15
+; VI-NEXT: v_mov_b32_e32 v30, v14
+; VI-NEXT: v_mov_b32_e32 v29, v13
+; VI-NEXT: v_mov_b32_e32 v28, v12
+; VI-NEXT: v_mov_b32_e32 v27, v11
+; VI-NEXT: v_mov_b32_e32 v26, v10
+; VI-NEXT: v_mov_b32_e32 v25, v9
+; VI-NEXT: v_mov_b32_e32 v24, v8
+; VI-NEXT: v_mov_b32_e32 v23, v7
+; VI-NEXT: v_mov_b32_e32 v22, v6
+; VI-NEXT: v_mov_b32_e32 v21, v5
+; VI-NEXT: v_mov_b32_e32 v20, v4
+; VI-NEXT: v_mov_b32_e32 v19, v3
+; VI-NEXT: .LBB116_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v32bf16_to_v16f32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v19, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v20, v19
+; GFX9-NEXT: v_mov_b32_e32 v21, v19
+; GFX9-NEXT: v_mov_b32_e32 v22, v19
+; GFX9-NEXT: v_mov_b32_e32 v23, v19
+; GFX9-NEXT: v_mov_b32_e32 v24, v19
+; GFX9-NEXT: v_mov_b32_e32 v25, v19
+; GFX9-NEXT: v_mov_b32_e32 v26, v19
+; GFX9-NEXT: v_mov_b32_e32 v27, v19
+; GFX9-NEXT: v_mov_b32_e32 v28, v19
+; GFX9-NEXT: v_mov_b32_e32 v29, v19
+; GFX9-NEXT: v_mov_b32_e32 v30, v19
+; GFX9-NEXT: v_mov_b32_e32 v31, v19
+; GFX9-NEXT: v_mov_b32_e32 v32, v19
+; GFX9-NEXT: v_mov_b32_e32 v33, v19
+; GFX9-NEXT: v_mov_b32_e32 v34, v19
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB116_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v34, v18
+; GFX9-NEXT: v_mov_b32_e32 v33, v17
+; GFX9-NEXT: v_mov_b32_e32 v32, v16
+; GFX9-NEXT: v_mov_b32_e32 v31, v15
+; GFX9-NEXT: v_mov_b32_e32 v30, v14
+; GFX9-NEXT: v_mov_b32_e32 v29, v13
+; GFX9-NEXT: v_mov_b32_e32 v28, v12
+; GFX9-NEXT: v_mov_b32_e32 v27, v11
+; GFX9-NEXT: v_mov_b32_e32 v26, v10
+; GFX9-NEXT: v_mov_b32_e32 v25, v9
+; GFX9-NEXT: v_mov_b32_e32 v24, v8
+; GFX9-NEXT: v_mov_b32_e32 v23, v7
+; GFX9-NEXT: v_mov_b32_e32 v22, v6
+; GFX9-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-NEXT: v_mov_b32_e32 v19, v3
+; GFX9-NEXT: .LBB116_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v32bf16_to_v16f32:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v19, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v20, v19
+; GFX11-NEXT: v_mov_b32_e32 v21, v19
+; GFX11-NEXT: v_mov_b32_e32 v22, v19
+; GFX11-NEXT: v_mov_b32_e32 v23, v19
+; GFX11-NEXT: v_mov_b32_e32 v24, v19
+; GFX11-NEXT: v_mov_b32_e32 v25, v19
+; GFX11-NEXT: v_mov_b32_e32 v26, v19
+; GFX11-NEXT: v_mov_b32_e32 v27, v19
+; GFX11-NEXT: v_mov_b32_e32 v28, v19
+; GFX11-NEXT: v_mov_b32_e32 v29, v19
+; GFX11-NEXT: v_mov_b32_e32 v30, v19
+; GFX11-NEXT: v_mov_b32_e32 v31, v19
+; GFX11-NEXT: v_mov_b32_e32 v32, v19
+; GFX11-NEXT: v_mov_b32_e32 v33, v19
+; GFX11-NEXT: v_mov_b32_e32 v34, v19
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB116_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
+; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
+; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
+; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
+; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
+; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
+; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
+; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
+; GFX11-NEXT: .LBB116_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
+; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
+; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1786,8 +15384,407 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v32bf16_to_v32f16:
+
define void @v_bitcast_v32bf16_to_v32f16(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v32bf16_to_v32f16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v46, 0
+; GCN-NEXT: v_mov_b32_e32 v60, 0
+; GCN-NEXT: v_mov_b32_e32 v56, 0
+; GCN-NEXT: v_mov_b32_e32 v61, 0
+; GCN-NEXT: v_mov_b32_e32 v57, 0
+; GCN-NEXT: v_mov_b32_e32 v62, 0
+; GCN-NEXT: v_mov_b32_e32 v58, 0
+; GCN-NEXT: v_mov_b32_e32 v63, 0
+; GCN-NEXT: v_mov_b32_e32 v54, 0
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: v_mov_b32_e32 v55, 0
+; GCN-NEXT: v_mov_b32_e32 v43, 0
+; GCN-NEXT: v_mov_b32_e32 v40, 0
+; GCN-NEXT: v_mov_b32_e32 v44, 0
+; GCN-NEXT: v_mov_b32_e32 v41, 0
+; GCN-NEXT: v_mov_b32_e32 v45, 0
+; GCN-NEXT: v_mov_b32_e32 v38, 0
+; GCN-NEXT: v_mov_b32_e32 v50, 0
+; GCN-NEXT: v_mov_b32_e32 v39, 0
+; GCN-NEXT: v_mov_b32_e32 v51, 0
+; GCN-NEXT: v_mov_b32_e32 v48, 0
+; GCN-NEXT: v_mov_b32_e32 v52, 0
+; GCN-NEXT: v_mov_b32_e32 v49, 0
+; GCN-NEXT: v_mov_b32_e32 v53, 0
+; GCN-NEXT: v_mov_b32_e32 v31, 0
+; GCN-NEXT: v_mov_b32_e32 v35, 0
+; GCN-NEXT: v_mov_b32_e32 v32, 0
+; GCN-NEXT: v_mov_b32_e32 v36, 0
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: v_mov_b32_e32 v37, 0
+; GCN-NEXT: v_mov_b32_e32 v34, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB117_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v28
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v47
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v59
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_cvt_f32_f16_e32 v46, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v60, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v56, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v61, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v57, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v62, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v58, v8
+; GCN-NEXT: v_cvt_f32_f16_e32 v63, v9
+; GCN-NEXT: v_cvt_f32_f16_e32 v54, v10
+; GCN-NEXT: v_cvt_f32_f16_e32 v42, v11
+; GCN-NEXT: v_cvt_f32_f16_e32 v55, v12
+; GCN-NEXT: v_cvt_f32_f16_e32 v43, v13
+; GCN-NEXT: v_cvt_f32_f16_e32 v40, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v44, v15
+; GCN-NEXT: v_cvt_f32_f16_e32 v41, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v45, v17
+; GCN-NEXT: v_cvt_f32_f16_e32 v38, v18
+; GCN-NEXT: v_cvt_f32_f16_e32 v50, v19
+; GCN-NEXT: v_cvt_f32_f16_e32 v39, v20
+; GCN-NEXT: v_cvt_f32_f16_e32 v51, v21
+; GCN-NEXT: v_cvt_f32_f16_e32 v48, v22
+; GCN-NEXT: v_cvt_f32_f16_e32 v52, v23
+; GCN-NEXT: v_cvt_f32_f16_e32 v49, v24
+; GCN-NEXT: v_cvt_f32_f16_e32 v53, v25
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v26
+; GCN-NEXT: v_cvt_f32_f16_e32 v35, v27
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v28
+; GCN-NEXT: v_cvt_f32_f16_e32 v36, v34
+; GCN-NEXT: v_cvt_f32_f16_e32 v33, v30
+; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37
+; GCN-NEXT: v_cvt_f32_f16_e32 v34, v47
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v29
+; GCN-NEXT: .LBB117_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v60
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v46
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v61
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v56
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v62
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v57
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v63
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v58
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, v42
+; GCN-NEXT: v_cvt_f16_f32_e32 v12, v54
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v43
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, v55
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v44
+; GCN-NEXT: v_cvt_f16_f32_e32 v16, v40
+; GCN-NEXT: v_cvt_f16_f32_e32 v17, v45
+; GCN-NEXT: v_cvt_f16_f32_e32 v18, v41
+; GCN-NEXT: v_cvt_f16_f32_e32 v19, v50
+; GCN-NEXT: v_cvt_f16_f32_e32 v20, v38
+; GCN-NEXT: v_cvt_f16_f32_e32 v21, v51
+; GCN-NEXT: v_cvt_f16_f32_e32 v22, v39
+; GCN-NEXT: v_cvt_f16_f32_e32 v23, v52
+; GCN-NEXT: v_cvt_f16_f32_e32 v24, v48
+; GCN-NEXT: v_cvt_f16_f32_e32 v25, v53
+; GCN-NEXT: v_cvt_f16_f32_e32 v26, v49
+; GCN-NEXT: v_cvt_f16_f32_e32 v27, v35
+; GCN-NEXT: v_cvt_f16_f32_e32 v28, v31
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_cvt_f16_f32_e32 v29, v36
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v30, v32
+; GCN-NEXT: v_cvt_f16_f32_e32 v31, v37
+; GCN-NEXT: v_cvt_f16_f32_e32 v32, v33
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v33, v34
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_or_b32_e32 v3, v4, v3
+; GCN-NEXT: v_or_b32_e32 v4, v6, v5
+; GCN-NEXT: v_or_b32_e32 v5, v8, v7
+; GCN-NEXT: v_or_b32_e32 v6, v10, v9
+; GCN-NEXT: v_or_b32_e32 v7, v12, v11
+; GCN-NEXT: v_or_b32_e32 v8, v14, v13
+; GCN-NEXT: v_or_b32_e32 v9, v16, v15
+; GCN-NEXT: v_or_b32_e32 v10, v18, v17
+; GCN-NEXT: v_or_b32_e32 v11, v20, v19
+; GCN-NEXT: v_or_b32_e32 v12, v22, v21
+; GCN-NEXT: v_or_b32_e32 v13, v24, v23
+; GCN-NEXT: v_or_b32_e32 v14, v26, v25
+; GCN-NEXT: v_or_b32_e32 v15, v28, v27
+; GCN-NEXT: v_or_b32_e32 v16, v30, v29
+; GCN-NEXT: v_or_b32_e32 v17, v32, v31
+; GCN-NEXT: v_or_b32_e32 v18, v33, v0
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v32bf16_to_v32f16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v19, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v20, v19
+; VI-NEXT: v_mov_b32_e32 v21, v19
+; VI-NEXT: v_mov_b32_e32 v22, v19
+; VI-NEXT: v_mov_b32_e32 v23, v19
+; VI-NEXT: v_mov_b32_e32 v24, v19
+; VI-NEXT: v_mov_b32_e32 v25, v19
+; VI-NEXT: v_mov_b32_e32 v26, v19
+; VI-NEXT: v_mov_b32_e32 v27, v19
+; VI-NEXT: v_mov_b32_e32 v28, v19
+; VI-NEXT: v_mov_b32_e32 v29, v19
+; VI-NEXT: v_mov_b32_e32 v30, v19
+; VI-NEXT: v_mov_b32_e32 v31, v19
+; VI-NEXT: v_mov_b32_e32 v32, v19
+; VI-NEXT: v_mov_b32_e32 v33, v19
+; VI-NEXT: v_mov_b32_e32 v34, v19
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB117_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v34, v18
+; VI-NEXT: v_mov_b32_e32 v33, v17
+; VI-NEXT: v_mov_b32_e32 v32, v16
+; VI-NEXT: v_mov_b32_e32 v31, v15
+; VI-NEXT: v_mov_b32_e32 v30, v14
+; VI-NEXT: v_mov_b32_e32 v29, v13
+; VI-NEXT: v_mov_b32_e32 v28, v12
+; VI-NEXT: v_mov_b32_e32 v27, v11
+; VI-NEXT: v_mov_b32_e32 v26, v10
+; VI-NEXT: v_mov_b32_e32 v25, v9
+; VI-NEXT: v_mov_b32_e32 v24, v8
+; VI-NEXT: v_mov_b32_e32 v23, v7
+; VI-NEXT: v_mov_b32_e32 v22, v6
+; VI-NEXT: v_mov_b32_e32 v21, v5
+; VI-NEXT: v_mov_b32_e32 v20, v4
+; VI-NEXT: v_mov_b32_e32 v19, v3
+; VI-NEXT: .LBB117_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v32bf16_to_v32f16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v19, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v20, v19
+; GFX9-NEXT: v_mov_b32_e32 v21, v19
+; GFX9-NEXT: v_mov_b32_e32 v22, v19
+; GFX9-NEXT: v_mov_b32_e32 v23, v19
+; GFX9-NEXT: v_mov_b32_e32 v24, v19
+; GFX9-NEXT: v_mov_b32_e32 v25, v19
+; GFX9-NEXT: v_mov_b32_e32 v26, v19
+; GFX9-NEXT: v_mov_b32_e32 v27, v19
+; GFX9-NEXT: v_mov_b32_e32 v28, v19
+; GFX9-NEXT: v_mov_b32_e32 v29, v19
+; GFX9-NEXT: v_mov_b32_e32 v30, v19
+; GFX9-NEXT: v_mov_b32_e32 v31, v19
+; GFX9-NEXT: v_mov_b32_e32 v32, v19
+; GFX9-NEXT: v_mov_b32_e32 v33, v19
+; GFX9-NEXT: v_mov_b32_e32 v34, v19
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB117_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v34, v18
+; GFX9-NEXT: v_mov_b32_e32 v33, v17
+; GFX9-NEXT: v_mov_b32_e32 v32, v16
+; GFX9-NEXT: v_mov_b32_e32 v31, v15
+; GFX9-NEXT: v_mov_b32_e32 v30, v14
+; GFX9-NEXT: v_mov_b32_e32 v29, v13
+; GFX9-NEXT: v_mov_b32_e32 v28, v12
+; GFX9-NEXT: v_mov_b32_e32 v27, v11
+; GFX9-NEXT: v_mov_b32_e32 v26, v10
+; GFX9-NEXT: v_mov_b32_e32 v25, v9
+; GFX9-NEXT: v_mov_b32_e32 v24, v8
+; GFX9-NEXT: v_mov_b32_e32 v23, v7
+; GFX9-NEXT: v_mov_b32_e32 v22, v6
+; GFX9-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-NEXT: v_mov_b32_e32 v19, v3
+; GFX9-NEXT: .LBB117_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v32bf16_to_v32f16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v19, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v20, v19
+; GFX11-NEXT: v_mov_b32_e32 v21, v19
+; GFX11-NEXT: v_mov_b32_e32 v22, v19
+; GFX11-NEXT: v_mov_b32_e32 v23, v19
+; GFX11-NEXT: v_mov_b32_e32 v24, v19
+; GFX11-NEXT: v_mov_b32_e32 v25, v19
+; GFX11-NEXT: v_mov_b32_e32 v26, v19
+; GFX11-NEXT: v_mov_b32_e32 v27, v19
+; GFX11-NEXT: v_mov_b32_e32 v28, v19
+; GFX11-NEXT: v_mov_b32_e32 v29, v19
+; GFX11-NEXT: v_mov_b32_e32 v30, v19
+; GFX11-NEXT: v_mov_b32_e32 v31, v19
+; GFX11-NEXT: v_mov_b32_e32 v32, v19
+; GFX11-NEXT: v_mov_b32_e32 v33, v19
+; GFX11-NEXT: v_mov_b32_e32 v34, v19
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB117_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
+; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
+; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
+; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
+; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
+; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
+; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
+; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
+; GFX11-NEXT: .LBB117_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
+; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
+; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1802,8 +15799,266 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v32bf16_to_v32i16:
+
define void @v_bitcast_v32bf16_to_v32i16(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v32bf16_to_v32i16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(2)
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v31, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v32, v31
+; GCN-NEXT: v_mov_b32_e32 v33, v31
+; GCN-NEXT: v_mov_b32_e32 v34, v31
+; GCN-NEXT: v_mov_b32_e32 v35, v31
+; GCN-NEXT: v_mov_b32_e32 v36, v31
+; GCN-NEXT: v_mov_b32_e32 v37, v31
+; GCN-NEXT: v_mov_b32_e32 v38, v31
+; GCN-NEXT: v_mov_b32_e32 v48, v31
+; GCN-NEXT: v_mov_b32_e32 v49, v31
+; GCN-NEXT: v_mov_b32_e32 v50, v31
+; GCN-NEXT: v_mov_b32_e32 v51, v31
+; GCN-NEXT: v_mov_b32_e32 v52, v31
+; GCN-NEXT: v_mov_b32_e32 v53, v31
+; GCN-NEXT: v_mov_b32_e32 v54, v31
+; GCN-NEXT: v_mov_b32_e32 v55, v31
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB118_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v30
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v42
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v41
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v40
+; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v31
+; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16
+; GCN-NEXT: v_alignbit_b32 v33, v6, v7, 16
+; GCN-NEXT: v_alignbit_b32 v34, v8, v9, 16
+; GCN-NEXT: v_alignbit_b32 v35, v10, v11, 16
+; GCN-NEXT: v_alignbit_b32 v36, v12, v13, 16
+; GCN-NEXT: v_alignbit_b32 v37, v14, v15, 16
+; GCN-NEXT: v_alignbit_b32 v38, v16, v17, 16
+; GCN-NEXT: v_alignbit_b32 v48, v18, v19, 16
+; GCN-NEXT: v_alignbit_b32 v49, v20, v21, 16
+; GCN-NEXT: v_alignbit_b32 v50, v22, v23, 16
+; GCN-NEXT: v_alignbit_b32 v51, v24, v25, 16
+; GCN-NEXT: v_alignbit_b32 v52, v26, v27, 16
+; GCN-NEXT: v_alignbit_b32 v53, v28, v29, 16
+; GCN-NEXT: v_alignbit_b32 v54, v30, v54, 16
+; GCN-NEXT: v_alignbit_b32 v55, v55, v39, 16
+; GCN-NEXT: .LBB118_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v32bf16_to_v32i16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v19, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v20, v19
+; VI-NEXT: v_mov_b32_e32 v21, v19
+; VI-NEXT: v_mov_b32_e32 v22, v19
+; VI-NEXT: v_mov_b32_e32 v23, v19
+; VI-NEXT: v_mov_b32_e32 v24, v19
+; VI-NEXT: v_mov_b32_e32 v25, v19
+; VI-NEXT: v_mov_b32_e32 v26, v19
+; VI-NEXT: v_mov_b32_e32 v27, v19
+; VI-NEXT: v_mov_b32_e32 v28, v19
+; VI-NEXT: v_mov_b32_e32 v29, v19
+; VI-NEXT: v_mov_b32_e32 v30, v19
+; VI-NEXT: v_mov_b32_e32 v31, v19
+; VI-NEXT: v_mov_b32_e32 v32, v19
+; VI-NEXT: v_mov_b32_e32 v33, v19
+; VI-NEXT: v_mov_b32_e32 v34, v19
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB118_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v34, v18
+; VI-NEXT: v_mov_b32_e32 v33, v17
+; VI-NEXT: v_mov_b32_e32 v32, v16
+; VI-NEXT: v_mov_b32_e32 v31, v15
+; VI-NEXT: v_mov_b32_e32 v30, v14
+; VI-NEXT: v_mov_b32_e32 v29, v13
+; VI-NEXT: v_mov_b32_e32 v28, v12
+; VI-NEXT: v_mov_b32_e32 v27, v11
+; VI-NEXT: v_mov_b32_e32 v26, v10
+; VI-NEXT: v_mov_b32_e32 v25, v9
+; VI-NEXT: v_mov_b32_e32 v24, v8
+; VI-NEXT: v_mov_b32_e32 v23, v7
+; VI-NEXT: v_mov_b32_e32 v22, v6
+; VI-NEXT: v_mov_b32_e32 v21, v5
+; VI-NEXT: v_mov_b32_e32 v20, v4
+; VI-NEXT: v_mov_b32_e32 v19, v3
+; VI-NEXT: .LBB118_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v32bf16_to_v32i16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v19, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v20, v19
+; GFX9-NEXT: v_mov_b32_e32 v21, v19
+; GFX9-NEXT: v_mov_b32_e32 v22, v19
+; GFX9-NEXT: v_mov_b32_e32 v23, v19
+; GFX9-NEXT: v_mov_b32_e32 v24, v19
+; GFX9-NEXT: v_mov_b32_e32 v25, v19
+; GFX9-NEXT: v_mov_b32_e32 v26, v19
+; GFX9-NEXT: v_mov_b32_e32 v27, v19
+; GFX9-NEXT: v_mov_b32_e32 v28, v19
+; GFX9-NEXT: v_mov_b32_e32 v29, v19
+; GFX9-NEXT: v_mov_b32_e32 v30, v19
+; GFX9-NEXT: v_mov_b32_e32 v31, v19
+; GFX9-NEXT: v_mov_b32_e32 v32, v19
+; GFX9-NEXT: v_mov_b32_e32 v33, v19
+; GFX9-NEXT: v_mov_b32_e32 v34, v19
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB118_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v34, v18
+; GFX9-NEXT: v_mov_b32_e32 v33, v17
+; GFX9-NEXT: v_mov_b32_e32 v32, v16
+; GFX9-NEXT: v_mov_b32_e32 v31, v15
+; GFX9-NEXT: v_mov_b32_e32 v30, v14
+; GFX9-NEXT: v_mov_b32_e32 v29, v13
+; GFX9-NEXT: v_mov_b32_e32 v28, v12
+; GFX9-NEXT: v_mov_b32_e32 v27, v11
+; GFX9-NEXT: v_mov_b32_e32 v26, v10
+; GFX9-NEXT: v_mov_b32_e32 v25, v9
+; GFX9-NEXT: v_mov_b32_e32 v24, v8
+; GFX9-NEXT: v_mov_b32_e32 v23, v7
+; GFX9-NEXT: v_mov_b32_e32 v22, v6
+; GFX9-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-NEXT: v_mov_b32_e32 v19, v3
+; GFX9-NEXT: .LBB118_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v32bf16_to_v32i16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v19, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v20, v19
+; GFX11-NEXT: v_mov_b32_e32 v21, v19
+; GFX11-NEXT: v_mov_b32_e32 v22, v19
+; GFX11-NEXT: v_mov_b32_e32 v23, v19
+; GFX11-NEXT: v_mov_b32_e32 v24, v19
+; GFX11-NEXT: v_mov_b32_e32 v25, v19
+; GFX11-NEXT: v_mov_b32_e32 v26, v19
+; GFX11-NEXT: v_mov_b32_e32 v27, v19
+; GFX11-NEXT: v_mov_b32_e32 v28, v19
+; GFX11-NEXT: v_mov_b32_e32 v29, v19
+; GFX11-NEXT: v_mov_b32_e32 v30, v19
+; GFX11-NEXT: v_mov_b32_e32 v31, v19
+; GFX11-NEXT: v_mov_b32_e32 v32, v19
+; GFX11-NEXT: v_mov_b32_e32 v33, v19
+; GFX11-NEXT: v_mov_b32_e32 v34, v19
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB118_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
+; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
+; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
+; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
+; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
+; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
+; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
+; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
+; GFX11-NEXT: .LBB118_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
+; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
+; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1818,8 +16073,266 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v32bf16_to_v64i8:
+
define void @v_bitcast_v32bf16_to_v64i8(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v32bf16_to_v64i8:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(2)
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v31, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v32, v31
+; GCN-NEXT: v_mov_b32_e32 v33, v31
+; GCN-NEXT: v_mov_b32_e32 v34, v31
+; GCN-NEXT: v_mov_b32_e32 v35, v31
+; GCN-NEXT: v_mov_b32_e32 v36, v31
+; GCN-NEXT: v_mov_b32_e32 v37, v31
+; GCN-NEXT: v_mov_b32_e32 v38, v31
+; GCN-NEXT: v_mov_b32_e32 v48, v31
+; GCN-NEXT: v_mov_b32_e32 v49, v31
+; GCN-NEXT: v_mov_b32_e32 v50, v31
+; GCN-NEXT: v_mov_b32_e32 v51, v31
+; GCN-NEXT: v_mov_b32_e32 v52, v31
+; GCN-NEXT: v_mov_b32_e32 v53, v31
+; GCN-NEXT: v_mov_b32_e32 v54, v31
+; GCN-NEXT: v_mov_b32_e32 v55, v31
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB119_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v30
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v42
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v41
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v40
+; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v31
+; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16
+; GCN-NEXT: v_alignbit_b32 v33, v6, v7, 16
+; GCN-NEXT: v_alignbit_b32 v34, v8, v9, 16
+; GCN-NEXT: v_alignbit_b32 v35, v10, v11, 16
+; GCN-NEXT: v_alignbit_b32 v36, v12, v13, 16
+; GCN-NEXT: v_alignbit_b32 v37, v14, v15, 16
+; GCN-NEXT: v_alignbit_b32 v38, v16, v17, 16
+; GCN-NEXT: v_alignbit_b32 v48, v18, v19, 16
+; GCN-NEXT: v_alignbit_b32 v49, v20, v21, 16
+; GCN-NEXT: v_alignbit_b32 v50, v22, v23, 16
+; GCN-NEXT: v_alignbit_b32 v51, v24, v25, 16
+; GCN-NEXT: v_alignbit_b32 v52, v26, v27, 16
+; GCN-NEXT: v_alignbit_b32 v53, v28, v29, 16
+; GCN-NEXT: v_alignbit_b32 v54, v30, v54, 16
+; GCN-NEXT: v_alignbit_b32 v55, v55, v39, 16
+; GCN-NEXT: .LBB119_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v32bf16_to_v64i8:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v19, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v20, v19
+; VI-NEXT: v_mov_b32_e32 v21, v19
+; VI-NEXT: v_mov_b32_e32 v22, v19
+; VI-NEXT: v_mov_b32_e32 v23, v19
+; VI-NEXT: v_mov_b32_e32 v24, v19
+; VI-NEXT: v_mov_b32_e32 v25, v19
+; VI-NEXT: v_mov_b32_e32 v26, v19
+; VI-NEXT: v_mov_b32_e32 v27, v19
+; VI-NEXT: v_mov_b32_e32 v28, v19
+; VI-NEXT: v_mov_b32_e32 v29, v19
+; VI-NEXT: v_mov_b32_e32 v30, v19
+; VI-NEXT: v_mov_b32_e32 v31, v19
+; VI-NEXT: v_mov_b32_e32 v32, v19
+; VI-NEXT: v_mov_b32_e32 v33, v19
+; VI-NEXT: v_mov_b32_e32 v34, v19
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB119_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v34, v18
+; VI-NEXT: v_mov_b32_e32 v33, v17
+; VI-NEXT: v_mov_b32_e32 v32, v16
+; VI-NEXT: v_mov_b32_e32 v31, v15
+; VI-NEXT: v_mov_b32_e32 v30, v14
+; VI-NEXT: v_mov_b32_e32 v29, v13
+; VI-NEXT: v_mov_b32_e32 v28, v12
+; VI-NEXT: v_mov_b32_e32 v27, v11
+; VI-NEXT: v_mov_b32_e32 v26, v10
+; VI-NEXT: v_mov_b32_e32 v25, v9
+; VI-NEXT: v_mov_b32_e32 v24, v8
+; VI-NEXT: v_mov_b32_e32 v23, v7
+; VI-NEXT: v_mov_b32_e32 v22, v6
+; VI-NEXT: v_mov_b32_e32 v21, v5
+; VI-NEXT: v_mov_b32_e32 v20, v4
+; VI-NEXT: v_mov_b32_e32 v19, v3
+; VI-NEXT: .LBB119_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v32bf16_to_v64i8:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v19, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v20, v19
+; GFX9-NEXT: v_mov_b32_e32 v21, v19
+; GFX9-NEXT: v_mov_b32_e32 v22, v19
+; GFX9-NEXT: v_mov_b32_e32 v23, v19
+; GFX9-NEXT: v_mov_b32_e32 v24, v19
+; GFX9-NEXT: v_mov_b32_e32 v25, v19
+; GFX9-NEXT: v_mov_b32_e32 v26, v19
+; GFX9-NEXT: v_mov_b32_e32 v27, v19
+; GFX9-NEXT: v_mov_b32_e32 v28, v19
+; GFX9-NEXT: v_mov_b32_e32 v29, v19
+; GFX9-NEXT: v_mov_b32_e32 v30, v19
+; GFX9-NEXT: v_mov_b32_e32 v31, v19
+; GFX9-NEXT: v_mov_b32_e32 v32, v19
+; GFX9-NEXT: v_mov_b32_e32 v33, v19
+; GFX9-NEXT: v_mov_b32_e32 v34, v19
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB119_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v34, v18
+; GFX9-NEXT: v_mov_b32_e32 v33, v17
+; GFX9-NEXT: v_mov_b32_e32 v32, v16
+; GFX9-NEXT: v_mov_b32_e32 v31, v15
+; GFX9-NEXT: v_mov_b32_e32 v30, v14
+; GFX9-NEXT: v_mov_b32_e32 v29, v13
+; GFX9-NEXT: v_mov_b32_e32 v28, v12
+; GFX9-NEXT: v_mov_b32_e32 v27, v11
+; GFX9-NEXT: v_mov_b32_e32 v26, v10
+; GFX9-NEXT: v_mov_b32_e32 v25, v9
+; GFX9-NEXT: v_mov_b32_e32 v24, v8
+; GFX9-NEXT: v_mov_b32_e32 v23, v7
+; GFX9-NEXT: v_mov_b32_e32 v22, v6
+; GFX9-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-NEXT: v_mov_b32_e32 v19, v3
+; GFX9-NEXT: .LBB119_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v32bf16_to_v64i8:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v19, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v20, v19
+; GFX11-NEXT: v_mov_b32_e32 v21, v19
+; GFX11-NEXT: v_mov_b32_e32 v22, v19
+; GFX11-NEXT: v_mov_b32_e32 v23, v19
+; GFX11-NEXT: v_mov_b32_e32 v24, v19
+; GFX11-NEXT: v_mov_b32_e32 v25, v19
+; GFX11-NEXT: v_mov_b32_e32 v26, v19
+; GFX11-NEXT: v_mov_b32_e32 v27, v19
+; GFX11-NEXT: v_mov_b32_e32 v28, v19
+; GFX11-NEXT: v_mov_b32_e32 v29, v19
+; GFX11-NEXT: v_mov_b32_e32 v30, v19
+; GFX11-NEXT: v_mov_b32_e32 v31, v19
+; GFX11-NEXT: v_mov_b32_e32 v32, v19
+; GFX11-NEXT: v_mov_b32_e32 v33, v19
+; GFX11-NEXT: v_mov_b32_e32 v34, v19
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB119_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
+; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
+; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
+; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
+; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
+; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
+; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
+; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
+; GFX11-NEXT: .LBB119_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
+; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
+; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1834,8 +16347,1127 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v64i8_to_v32bf16:
+
define void @v_bitcast_v64i8_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <64 x i8> %value) {
+; GCN-LABEL: v_bitcast_v64i8_to_v32bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:28
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:16
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v50, 0
+; GCN-NEXT: v_mov_b32_e32 v52, 0
+; GCN-NEXT: v_mov_b32_e32 v31, 0
+; GCN-NEXT: v_mov_b32_e32 v48, 0
+; GCN-NEXT: v_mov_b32_e32 v18, 0
+; GCN-NEXT: v_mov_b32_e32 v40, 0
+; GCN-NEXT: v_mov_b32_e32 v55, 0
+; GCN-NEXT: v_mov_b32_e32 v41, 0
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: v_mov_b32_e32 v20, 0
+; GCN-NEXT: v_mov_b32_e32 v49, 0
+; GCN-NEXT: v_mov_b32_e32 v53, 0
+; GCN-NEXT: v_mov_b32_e32 v21, 0
+; GCN-NEXT: v_mov_b32_e32 v19, 0
+; GCN-NEXT: v_mov_b32_e32 v51, 0
+; GCN-NEXT: v_mov_b32_e32 v54, 0
+; GCN-NEXT: v_mov_b32_e32 v35, 0
+; GCN-NEXT: v_mov_b32_e32 v37, 0
+; GCN-NEXT: v_mov_b32_e32 v27, 0
+; GCN-NEXT: v_mov_b32_e32 v25, 0
+; GCN-NEXT: v_mov_b32_e32 v36, 0
+; GCN-NEXT: v_mov_b32_e32 v38, 0
+; GCN-NEXT: v_mov_b32_e32 v26, 0
+; GCN-NEXT: v_mov_b32_e32 v39, 0
+; GCN-NEXT: v_mov_b32_e32 v63, 0
+; GCN-NEXT: v_mov_b32_e32 v29, 0
+; GCN-NEXT: v_mov_b32_e32 v32, 0
+; GCN-NEXT: v_mov_b32_e32 v34, 0
+; GCN-NEXT: v_mov_b32_e32 v30, 0
+; GCN-NEXT: v_mov_b32_e32 v28, 0
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB120_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_and_b32_e32 v0, 0xff, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v8
+; GCN-NEXT: v_or_b32_e32 v31, v0, v7
+; GCN-NEXT: v_and_b32_e32 v0, 0xff, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v16
+; GCN-NEXT: v_or_b32_e32 v0, v0, v7
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GCN-NEXT: v_and_b32_e32 v7, 0xff, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v24
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v0, v7, v8
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_and_b32_e32 v8, 0xff, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v44
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v0, v8, v11
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GCN-NEXT: v_and_b32_e32 v11, 0xff, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v4
+; GCN-NEXT: v_and_b32_e32 v18, 0xff, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v52, 24, v6
+; GCN-NEXT: v_and_b32_e32 v20, 0xff, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v48, 24, v10
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v26, 0xff, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v27, 0xff, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 24, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v28, 0xff, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v29, 0xff, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v30, 0xff, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 24, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xff, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 24, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v49, 0xff, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v0
+; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v54, 24, v43
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v42
+; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v17
+; GCN-NEXT: v_and_b32_e32 v51, 0xff, v62
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v61
+; GCN-NEXT: v_and_b32_e32 v55, 0xff, v60
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v59
+; GCN-NEXT: v_and_b32_e32 v40, 0xff, v58
+; GCN-NEXT: v_lshlrev_b32_e32 v57, 24, v57
+; GCN-NEXT: v_and_b32_e32 v41, 0xff, v56
+; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v47
+; GCN-NEXT: v_and_b32_e32 v46, 0xff, v46
+; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v45
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v47, 0xff, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v39, 24, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v56, 0xff, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v44, 24, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v58, 0xff, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v43, 24, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v59, 0xff, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v60, 8, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v61, 0xff, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v42, 24, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v62, 0xff, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 24, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v63, 0xff, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 24, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v33, 8, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v34, 0xff, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v26
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v30
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v49
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v14
+; GCN-NEXT: v_or_b32_e32 v12, v51, v53
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v55
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v40
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v41
+; GCN-NEXT: v_or_b32_e32 v45, v46, v45
+; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v47
+; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v56
+; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v58
+; GCN-NEXT: v_or_b32_e32 v58, v59, v60
+; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v61
+; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v62
+; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v63
+; GCN-NEXT: v_or_b32_e32 v0, v0, v33
+; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v34
+; GCN-NEXT: v_or_b32_e32 v50, v50, v3
+; GCN-NEXT: v_or_b32_e32 v52, v52, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_or_b32_e32 v48, v48, v4
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v18, v3, v36
+; GCN-NEXT: v_or_b32_e32 v40, v7, v37
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3
+; GCN-NEXT: v_or_b32_e32 v41, v8, v11
+; GCN-NEXT: v_or_b32_e32 v22, v6, v20
+; GCN-NEXT: v_or_b32_e32 v20, v9, v35
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v3
+; GCN-NEXT: v_or_b32_e32 v53, v10, v29
+; GCN-NEXT: v_or_b32_e32 v21, v21, v30
+; GCN-NEXT: v_or_b32_e32 v19, v19, v32
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v3
+; GCN-NEXT: v_or_b32_e32 v54, v54, v26
+; GCN-NEXT: v_or_b32_e32 v35, v25, v27
+; GCN-NEXT: v_or_b32_e32 v37, v15, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v12
+; GCN-NEXT: v_or_b32_e32 v25, v16, v13
+; GCN-NEXT: v_or_b32_e32 v36, v57, v14
+; GCN-NEXT: v_or_b32_e32 v38, v38, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v45
+; GCN-NEXT: v_or_b32_e32 v39, v39, v46
+; GCN-NEXT: v_or_b32_e32 v63, v44, v47
+; GCN-NEXT: v_or_b32_e32 v29, v43, v56
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v58
+; GCN-NEXT: v_or_b32_e32 v34, v42, v59
+; GCN-NEXT: v_or_b32_e32 v30, v23, v60
+; GCN-NEXT: v_or_b32_e32 v28, v24, v61
+; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v0
+; GCN-NEXT: v_or_b32_e32 v0, v17, v62
+; GCN-NEXT: .LBB120_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v52
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v50
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v48
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v31
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v40
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v41
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v55
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v53
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v49
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v54
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v51
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v37
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v35
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v38
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v36
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v39
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v63
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v34
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
+; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
+; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16
+; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16
+; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16
+; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16
+; GCN-NEXT: v_alignbit_b32 v10, v17, v18, 16
+; GCN-NEXT: v_alignbit_b32 v11, v19, v20, 16
+; GCN-NEXT: v_alignbit_b32 v12, v21, v22, 16
+; GCN-NEXT: v_alignbit_b32 v13, v23, v24, 16
+; GCN-NEXT: v_alignbit_b32 v14, v25, v26, 16
+; GCN-NEXT: v_alignbit_b32 v15, v27, v29, 16
+; GCN-NEXT: v_alignbit_b32 v16, v31, v32, 16
+; GCN-NEXT: v_alignbit_b32 v17, v28, v30, 16
+; GCN-NEXT: v_alignbit_b32 v18, v0, v33, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v64i8_to_v32bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140
+; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:136
+; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:128
+; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:120
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:112
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:108
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:104
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:100
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:96
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:92
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:88
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:84
+; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:80
+; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:76
+; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:72
+; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:68
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:64
+; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:56
+; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:52
+; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:48
+; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:40
+; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:32
+; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:24
+; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:16
+; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32
+; VI-NEXT: v_mov_b32_e32 v31, 0
+; VI-NEXT: v_mov_b32_e32 v32, v31
+; VI-NEXT: v_mov_b32_e32 v33, v31
+; VI-NEXT: v_mov_b32_e32 v34, v31
+; VI-NEXT: v_mov_b32_e32 v35, v31
+; VI-NEXT: v_mov_b32_e32 v36, v31
+; VI-NEXT: v_mov_b32_e32 v37, v31
+; VI-NEXT: v_mov_b32_e32 v38, v31
+; VI-NEXT: v_mov_b32_e32 v48, v31
+; VI-NEXT: v_mov_b32_e32 v49, v31
+; VI-NEXT: v_mov_b32_e32 v50, v31
+; VI-NEXT: v_mov_b32_e32 v51, v31
+; VI-NEXT: v_mov_b32_e32 v52, v31
+; VI-NEXT: v_mov_b32_e32 v53, v31
+; VI-NEXT: v_mov_b32_e32 v54, v31
+; VI-NEXT: v_mov_b32_e32 v55, v31
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB120_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4
+; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v6
+; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v8
+; VI-NEXT: v_lshlrev_b16_e32 v6, 8, v10
+; VI-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v6, v9, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v31, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v32, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v12
+; VI-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v33, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4
+; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v34, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4
+; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v35, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4
+; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v36, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4
+; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v37, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v13
+; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v15
+; VI-NEXT: v_or_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v4, v18, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v38, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v17
+; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v19
+; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v4, v22, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v48, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v21
+; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v23
+; VI-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v4, v26, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v49, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v25
+; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v27
+; VI-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v50, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v29
+; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v63
+; VI-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v51, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v61
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v59
+; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v52, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v57
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v47
+; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v53, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v43
+; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v54, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v41
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v39
+; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v14, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v55, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: .LBB120_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[52:55]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[48:51]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[35:38]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[31:34]
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v64i8_to_v32bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140
+; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:136
+; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:128
+; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:120
+; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:112
+; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:108
+; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:104
+; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:100
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:96
+; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:92
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:88
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:84
+; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:80
+; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:76
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:72
+; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:68
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:64
+; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:56
+; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:52
+; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:48
+; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:40
+; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:32
+; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:24
+; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:16
+; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32
+; GFX9-NEXT: v_mov_b32_e32 v31, 0
+; GFX9-NEXT: v_mov_b32_e32 v32, v31
+; GFX9-NEXT: v_mov_b32_e32 v33, v31
+; GFX9-NEXT: v_mov_b32_e32 v34, v31
+; GFX9-NEXT: v_mov_b32_e32 v35, v31
+; GFX9-NEXT: v_mov_b32_e32 v36, v31
+; GFX9-NEXT: v_mov_b32_e32 v37, v31
+; GFX9-NEXT: v_mov_b32_e32 v38, v31
+; GFX9-NEXT: v_mov_b32_e32 v48, v31
+; GFX9-NEXT: v_mov_b32_e32 v49, v31
+; GFX9-NEXT: v_mov_b32_e32 v50, v31
+; GFX9-NEXT: v_mov_b32_e32 v51, v31
+; GFX9-NEXT: v_mov_b32_e32 v52, v31
+; GFX9-NEXT: v_mov_b32_e32 v53, v31
+; GFX9-NEXT: v_mov_b32_e32 v54, v31
+; GFX9-NEXT: v_mov_b32_e32 v55, v31
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB120_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v6
+; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v8
+; GFX9-NEXT: v_lshlrev_b16_e32 v6, 8, v10
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
+; GFX9-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v31, v4, v3, s6
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-NEXT: v_perm_b32 v32, v6, v5, s6
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v12
+; GFX9-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v33, v4, v3, s6
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4
+; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v34, v4, v3, s6
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4
+; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v35, v4, v3, s6
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4
+; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v36, v4, v3, s6
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4
+; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v37, v4, v3, s6
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v13
+; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v15
+; GFX9-NEXT: v_or_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v38, v4, v3, s6
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v17
+; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v19
+; GFX9-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v4, v22, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v48, v4, v3, s6
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v21
+; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v23
+; GFX9-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v4, v26, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v49, v4, v3, s6
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v25
+; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v27
+; GFX9-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v50, v4, v3, s6
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v29
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v63
+; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v51, v3, v0, s6
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v61
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v59
+; GFX9-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v52, v3, v0, s6
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v47
+; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v3, v46, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v53, v3, v0, s6
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v45
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v43
+; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v54, v3, v0, s6
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v41
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v39
+; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v3, v14, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v55, v3, v0, s6
+; GFX9-NEXT: .LBB120_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[52:55], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[48:51], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v64i8_to_v32bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_u16 v39, off, s32 offset:140
+; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:136
+; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:132
+; GFX11-NEXT: scratch_load_u16 v66, off, s32 offset:128
+; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:124
+; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:120
+; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:116
+; GFX11-NEXT: scratch_load_u16 v70, off, s32 offset:112
+; GFX11-NEXT: scratch_load_u16 v71, off, s32 offset:108
+; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:104
+; GFX11-NEXT: scratch_load_u16 v81, off, s32 offset:100
+; GFX11-NEXT: scratch_load_u16 v82, off, s32 offset:96
+; GFX11-NEXT: scratch_load_u16 v83, off, s32 offset:92
+; GFX11-NEXT: scratch_load_u16 v84, off, s32 offset:88
+; GFX11-NEXT: scratch_load_u16 v85, off, s32 offset:84
+; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:80
+; GFX11-NEXT: scratch_load_u16 v87, off, s32 offset:76
+; GFX11-NEXT: scratch_load_u16 v96, off, s32 offset:72
+; GFX11-NEXT: scratch_load_u16 v97, off, s32 offset:68
+; GFX11-NEXT: scratch_load_u16 v98, off, s32 offset:64
+; GFX11-NEXT: scratch_load_u16 v99, off, s32 offset:60
+; GFX11-NEXT: scratch_load_u16 v100, off, s32 offset:56
+; GFX11-NEXT: scratch_load_u16 v101, off, s32 offset:52
+; GFX11-NEXT: scratch_load_u16 v102, off, s32 offset:48
+; GFX11-NEXT: scratch_load_u16 v103, off, s32 offset:44
+; GFX11-NEXT: scratch_load_u16 v112, off, s32 offset:40
+; GFX11-NEXT: scratch_load_u16 v113, off, s32 offset:36
+; GFX11-NEXT: scratch_load_u16 v114, off, s32 offset:32
+; GFX11-NEXT: scratch_load_u16 v115, off, s32 offset:28
+; GFX11-NEXT: scratch_load_u16 v116, off, s32 offset:24
+; GFX11-NEXT: scratch_load_u16 v117, off, s32 offset:20
+; GFX11-NEXT: scratch_load_u16 v118, off, s32 offset:16
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: scratch_load_u16 v119, off, s32 offset:12
+; GFX11-NEXT: scratch_load_u16 v128, off, s32 offset:8
+; GFX11-NEXT: scratch_load_u16 v129, off, s32 offset:4
+; GFX11-NEXT: scratch_load_u16 v130, off, s32
+; GFX11-NEXT: v_mov_b32_e32 v31, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v32, v31
+; GFX11-NEXT: v_mov_b32_e32 v33, v31
+; GFX11-NEXT: v_mov_b32_e32 v34, v31
+; GFX11-NEXT: v_mov_b32_e32 v35, v31
+; GFX11-NEXT: v_mov_b32_e32 v36, v31
+; GFX11-NEXT: v_mov_b32_e32 v37, v31
+; GFX11-NEXT: v_mov_b32_e32 v38, v31
+; GFX11-NEXT: v_mov_b32_e32 v48, v31
+; GFX11-NEXT: v_mov_b32_e32 v49, v31
+; GFX11-NEXT: v_mov_b32_e32 v50, v31
+; GFX11-NEXT: v_mov_b32_e32 v51, v31
+; GFX11-NEXT: v_mov_b32_e32 v52, v31
+; GFX11-NEXT: v_mov_b32_e32 v53, v31
+; GFX11-NEXT: v_mov_b32_e32 v54, v31
+; GFX11-NEXT: v_mov_b32_e32 v55, v31
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB120_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v3
+; GFX11-NEXT: v_lshlrev_b16 v3, 8, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-NEXT: v_lshlrev_b16 v5, 8, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v9
+; GFX11-NEXT: v_lshlrev_b16 v9, 8, v14
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX11-NEXT: v_lshlrev_b16 v14, 8, v24
+; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v7
+; GFX11-NEXT: v_lshlrev_b16 v5, 8, v8
+; GFX11-NEXT: v_lshlrev_b16 v7, 8, v10
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v13
+; GFX11-NEXT: v_perm_b32 v31, v3, v0, 0x5040100
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v11
+; GFX11-NEXT: v_lshlrev_b16 v3, 8, v12
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v15
+; GFX11-NEXT: v_lshlrev_b16 v11, 8, v16
+; GFX11-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX11-NEXT: v_or_b32_e32 v5, v6, v7
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX11-NEXT: v_or_b32_e32 v3, v8, v9
+; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v17
+; GFX11-NEXT: v_lshlrev_b16 v8, 8, v18
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v19
+; GFX11-NEXT: v_lshlrev_b16 v10, 8, v20
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v21
+; GFX11-NEXT: v_lshlrev_b16 v12, 8, v22
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v23
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v25
+; GFX11-NEXT: v_lshlrev_b16 v16, 8, v26
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
+; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
+; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
+; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
+; GFX11-NEXT: v_perm_b32 v32, v5, v4, 0x5040100
+; GFX11-NEXT: v_perm_b32 v33, v3, v0, 0x5040100
+; GFX11-NEXT: v_perm_b32 v34, v7, v6, 0x5040100
+; GFX11-NEXT: v_perm_b32 v35, v9, v8, 0x5040100
+; GFX11-NEXT: v_perm_b32 v36, v11, v10, 0x5040100
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v27
+; GFX11-NEXT: v_lshlrev_b16 v3, 8, v28
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v29
+; GFX11-NEXT: v_lshlrev_b16 v5, 8, v30
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v130
+; GFX11-NEXT: v_lshlrev_b16 v7, 8, v129
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v128
+; GFX11-NEXT: v_lshlrev_b16 v9, 8, v119
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v118
+; GFX11-NEXT: v_lshlrev_b16 v11, 8, v117
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
+; GFX11-NEXT: v_or_b32_e32 v4, v6, v7
+; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
+; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v116
+; GFX11-NEXT: v_lshlrev_b16 v8, 8, v115
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v114
+; GFX11-NEXT: v_lshlrev_b16 v10, 8, v113
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v112
+; GFX11-NEXT: v_lshlrev_b16 v12, 8, v103
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v102
+; GFX11-NEXT: v_lshlrev_b16 v14, 8, v101
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v100
+; GFX11-NEXT: v_lshlrev_b16 v16, 8, v99
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
+; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
+; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
+; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
+; GFX11-NEXT: v_perm_b32 v37, v3, v0, 0x5040100
+; GFX11-NEXT: v_perm_b32 v38, v5, v4, 0x5040100
+; GFX11-NEXT: v_perm_b32 v48, v7, v6, 0x5040100
+; GFX11-NEXT: v_perm_b32 v49, v9, v8, 0x5040100
+; GFX11-NEXT: v_perm_b32 v50, v11, v10, 0x5040100
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v98
+; GFX11-NEXT: v_lshlrev_b16 v3, 8, v97
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v96
+; GFX11-NEXT: v_lshlrev_b16 v5, 8, v87
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v86
+; GFX11-NEXT: v_lshlrev_b16 v7, 8, v85
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v84
+; GFX11-NEXT: v_lshlrev_b16 v9, 8, v83
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v82
+; GFX11-NEXT: v_lshlrev_b16 v11, 8, v81
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
+; GFX11-NEXT: v_or_b32_e32 v4, v6, v7
+; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
+; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v80
+; GFX11-NEXT: v_lshlrev_b16 v8, 8, v71
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v70
+; GFX11-NEXT: v_lshlrev_b16 v10, 8, v69
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v68
+; GFX11-NEXT: v_lshlrev_b16 v12, 8, v67
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v66
+; GFX11-NEXT: v_lshlrev_b16 v14, 8, v65
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v64
+; GFX11-NEXT: v_lshlrev_b16 v16, 8, v39
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
+; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
+; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
+; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
+; GFX11-NEXT: v_perm_b32 v51, v3, v0, 0x5040100
+; GFX11-NEXT: v_perm_b32 v52, v5, v4, 0x5040100
+; GFX11-NEXT: v_perm_b32 v53, v7, v6, 0x5040100
+; GFX11-NEXT: v_perm_b32 v54, v9, v8, 0x5040100
+; GFX11-NEXT: v_perm_b32 v55, v11, v10, 0x5040100
+; GFX11-NEXT: .LBB120_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_store_b128 v[1:2], v[52:55], off offset:48
+; GFX11-NEXT: global_store_b128 v[1:2], v[48:51], off offset:32
+; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1850,8 +17482,346 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v32i16_to_v32bf16:
+
define void @v_bitcast_v32i16_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <32 x i16> %value) {
+; GCN-LABEL: v_bitcast_v32i16_to_v32bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_waitcnt expcnt(3)
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12
+; GCN-NEXT: s_waitcnt expcnt(2)
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v29, 0
+; GCN-NEXT: v_mov_b32_e32 v46, 0
+; GCN-NEXT: v_mov_b32_e32 v58, 0
+; GCN-NEXT: v_mov_b32_e32 v47, 0
+; GCN-NEXT: v_mov_b32_e32 v59, 0
+; GCN-NEXT: v_mov_b32_e32 v56, 0
+; GCN-NEXT: v_mov_b32_e32 v60, 0
+; GCN-NEXT: v_mov_b32_e32 v57, 0
+; GCN-NEXT: v_mov_b32_e32 v61, 0
+; GCN-NEXT: v_mov_b32_e32 v54, 0
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: v_mov_b32_e32 v55, 0
+; GCN-NEXT: v_mov_b32_e32 v43, 0
+; GCN-NEXT: v_mov_b32_e32 v40, 0
+; GCN-NEXT: v_mov_b32_e32 v44, 0
+; GCN-NEXT: v_mov_b32_e32 v41, 0
+; GCN-NEXT: v_mov_b32_e32 v45, 0
+; GCN-NEXT: v_mov_b32_e32 v38, 0
+; GCN-NEXT: v_mov_b32_e32 v50, 0
+; GCN-NEXT: v_mov_b32_e32 v39, 0
+; GCN-NEXT: v_mov_b32_e32 v51, 0
+; GCN-NEXT: v_mov_b32_e32 v48, 0
+; GCN-NEXT: v_mov_b32_e32 v52, 0
+; GCN-NEXT: v_mov_b32_e32 v49, 0
+; GCN-NEXT: v_mov_b32_e32 v53, 0
+; GCN-NEXT: v_mov_b32_e32 v31, 0
+; GCN-NEXT: v_mov_b32_e32 v35, 0
+; GCN-NEXT: v_mov_b32_e32 v32, 0
+; GCN-NEXT: v_mov_b32_e32 v36, 0
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: v_mov_b32_e32 v37, 0
+; GCN-NEXT: v_mov_b32_e32 v34, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB121_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v26
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v28
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v30
+; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v63
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v62
+; GCN-NEXT: .LBB121_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v58
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v46
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v59
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v47
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v60
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v56
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v61
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v57
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v42
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v54
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v43
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v55
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v44
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v40
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v45
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v41
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v50
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v38
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v51
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v39
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v52
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v48
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v53
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v49
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v35
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v31
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v36
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v37
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v34
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GCN-NEXT: v_alignbit_b32 v5, v6, v7, 16
+; GCN-NEXT: v_alignbit_b32 v6, v8, v9, 16
+; GCN-NEXT: v_alignbit_b32 v7, v10, v11, 16
+; GCN-NEXT: v_alignbit_b32 v8, v12, v13, 16
+; GCN-NEXT: v_alignbit_b32 v9, v14, v15, 16
+; GCN-NEXT: v_alignbit_b32 v10, v16, v17, 16
+; GCN-NEXT: v_alignbit_b32 v11, v18, v19, 16
+; GCN-NEXT: v_alignbit_b32 v12, v20, v21, 16
+; GCN-NEXT: v_alignbit_b32 v13, v22, v23, 16
+; GCN-NEXT: v_alignbit_b32 v14, v24, v25, 16
+; GCN-NEXT: v_alignbit_b32 v15, v26, v27, 16
+; GCN-NEXT: v_alignbit_b32 v16, v28, v30, 16
+; GCN-NEXT: v_alignbit_b32 v17, v31, v32, 16
+; GCN-NEXT: v_alignbit_b32 v18, v29, v33, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v32i16_to_v32bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v19, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v20, v19
+; VI-NEXT: v_mov_b32_e32 v21, v19
+; VI-NEXT: v_mov_b32_e32 v22, v19
+; VI-NEXT: v_mov_b32_e32 v23, v19
+; VI-NEXT: v_mov_b32_e32 v24, v19
+; VI-NEXT: v_mov_b32_e32 v25, v19
+; VI-NEXT: v_mov_b32_e32 v26, v19
+; VI-NEXT: v_mov_b32_e32 v27, v19
+; VI-NEXT: v_mov_b32_e32 v28, v19
+; VI-NEXT: v_mov_b32_e32 v29, v19
+; VI-NEXT: v_mov_b32_e32 v30, v19
+; VI-NEXT: v_mov_b32_e32 v31, v19
+; VI-NEXT: v_mov_b32_e32 v32, v19
+; VI-NEXT: v_mov_b32_e32 v33, v19
+; VI-NEXT: v_mov_b32_e32 v34, v19
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB121_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v34, v18
+; VI-NEXT: v_mov_b32_e32 v33, v17
+; VI-NEXT: v_mov_b32_e32 v32, v16
+; VI-NEXT: v_mov_b32_e32 v31, v15
+; VI-NEXT: v_mov_b32_e32 v30, v14
+; VI-NEXT: v_mov_b32_e32 v29, v13
+; VI-NEXT: v_mov_b32_e32 v28, v12
+; VI-NEXT: v_mov_b32_e32 v27, v11
+; VI-NEXT: v_mov_b32_e32 v26, v10
+; VI-NEXT: v_mov_b32_e32 v25, v9
+; VI-NEXT: v_mov_b32_e32 v24, v8
+; VI-NEXT: v_mov_b32_e32 v23, v7
+; VI-NEXT: v_mov_b32_e32 v22, v6
+; VI-NEXT: v_mov_b32_e32 v21, v5
+; VI-NEXT: v_mov_b32_e32 v20, v4
+; VI-NEXT: v_mov_b32_e32 v19, v3
+; VI-NEXT: .LBB121_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v32i16_to_v32bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v19, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v20, v19
+; GFX9-NEXT: v_mov_b32_e32 v21, v19
+; GFX9-NEXT: v_mov_b32_e32 v22, v19
+; GFX9-NEXT: v_mov_b32_e32 v23, v19
+; GFX9-NEXT: v_mov_b32_e32 v24, v19
+; GFX9-NEXT: v_mov_b32_e32 v25, v19
+; GFX9-NEXT: v_mov_b32_e32 v26, v19
+; GFX9-NEXT: v_mov_b32_e32 v27, v19
+; GFX9-NEXT: v_mov_b32_e32 v28, v19
+; GFX9-NEXT: v_mov_b32_e32 v29, v19
+; GFX9-NEXT: v_mov_b32_e32 v30, v19
+; GFX9-NEXT: v_mov_b32_e32 v31, v19
+; GFX9-NEXT: v_mov_b32_e32 v32, v19
+; GFX9-NEXT: v_mov_b32_e32 v33, v19
+; GFX9-NEXT: v_mov_b32_e32 v34, v19
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB121_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v34, v18
+; GFX9-NEXT: v_mov_b32_e32 v33, v17
+; GFX9-NEXT: v_mov_b32_e32 v32, v16
+; GFX9-NEXT: v_mov_b32_e32 v31, v15
+; GFX9-NEXT: v_mov_b32_e32 v30, v14
+; GFX9-NEXT: v_mov_b32_e32 v29, v13
+; GFX9-NEXT: v_mov_b32_e32 v28, v12
+; GFX9-NEXT: v_mov_b32_e32 v27, v11
+; GFX9-NEXT: v_mov_b32_e32 v26, v10
+; GFX9-NEXT: v_mov_b32_e32 v25, v9
+; GFX9-NEXT: v_mov_b32_e32 v24, v8
+; GFX9-NEXT: v_mov_b32_e32 v23, v7
+; GFX9-NEXT: v_mov_b32_e32 v22, v6
+; GFX9-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-NEXT: v_mov_b32_e32 v19, v3
+; GFX9-NEXT: .LBB121_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v32i16_to_v32bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v19, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v20, v19
+; GFX11-NEXT: v_mov_b32_e32 v21, v19
+; GFX11-NEXT: v_mov_b32_e32 v22, v19
+; GFX11-NEXT: v_mov_b32_e32 v23, v19
+; GFX11-NEXT: v_mov_b32_e32 v24, v19
+; GFX11-NEXT: v_mov_b32_e32 v25, v19
+; GFX11-NEXT: v_mov_b32_e32 v26, v19
+; GFX11-NEXT: v_mov_b32_e32 v27, v19
+; GFX11-NEXT: v_mov_b32_e32 v28, v19
+; GFX11-NEXT: v_mov_b32_e32 v29, v19
+; GFX11-NEXT: v_mov_b32_e32 v30, v19
+; GFX11-NEXT: v_mov_b32_e32 v31, v19
+; GFX11-NEXT: v_mov_b32_e32 v32, v19
+; GFX11-NEXT: v_mov_b32_e32 v33, v19
+; GFX11-NEXT: v_mov_b32_e32 v34, v19
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB121_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
+; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
+; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
+; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
+; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
+; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
+; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
+; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
+; GFX11-NEXT: .LBB121_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
+; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
+; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1866,8 +17836,375 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v32f16_to_v32bf16:
+
define void @v_bitcast_v32f16_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <32 x half> %value) {
+; GCN-LABEL: v_bitcast_v32f16_to_v32bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v46, 0
+; GCN-NEXT: v_mov_b32_e32 v58, 0
+; GCN-NEXT: v_mov_b32_e32 v47, 0
+; GCN-NEXT: v_mov_b32_e32 v59, 0
+; GCN-NEXT: v_mov_b32_e32 v56, 0
+; GCN-NEXT: v_mov_b32_e32 v60, 0
+; GCN-NEXT: v_mov_b32_e32 v57, 0
+; GCN-NEXT: v_mov_b32_e32 v61, 0
+; GCN-NEXT: v_mov_b32_e32 v54, 0
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: v_mov_b32_e32 v55, 0
+; GCN-NEXT: v_mov_b32_e32 v43, 0
+; GCN-NEXT: v_mov_b32_e32 v40, 0
+; GCN-NEXT: v_mov_b32_e32 v44, 0
+; GCN-NEXT: v_mov_b32_e32 v41, 0
+; GCN-NEXT: v_mov_b32_e32 v45, 0
+; GCN-NEXT: v_mov_b32_e32 v38, 0
+; GCN-NEXT: v_mov_b32_e32 v50, 0
+; GCN-NEXT: v_mov_b32_e32 v39, 0
+; GCN-NEXT: v_mov_b32_e32 v51, 0
+; GCN-NEXT: v_mov_b32_e32 v48, 0
+; GCN-NEXT: v_mov_b32_e32 v52, 0
+; GCN-NEXT: v_mov_b32_e32 v49, 0
+; GCN-NEXT: v_mov_b32_e32 v53, 0
+; GCN-NEXT: v_mov_b32_e32 v31, 0
+; GCN-NEXT: v_mov_b32_e32 v35, 0
+; GCN-NEXT: v_mov_b32_e32 v32, 0
+; GCN-NEXT: v_mov_b32_e32 v36, 0
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: v_mov_b32_e32 v37, 0
+; GCN-NEXT: v_mov_b32_e32 v34, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB122_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v9
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v10
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, v12
+; GCN-NEXT: v_cvt_f16_f32_e32 v12, v13
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v14
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, v15
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v16
+; GCN-NEXT: v_cvt_f16_f32_e32 v16, v17
+; GCN-NEXT: v_cvt_f16_f32_e32 v17, v18
+; GCN-NEXT: v_cvt_f16_f32_e32 v18, v19
+; GCN-NEXT: v_cvt_f16_f32_e32 v19, v20
+; GCN-NEXT: v_cvt_f16_f32_e32 v20, v21
+; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22
+; GCN-NEXT: v_cvt_f16_f32_e32 v22, v23
+; GCN-NEXT: v_cvt_f16_f32_e32 v23, v24
+; GCN-NEXT: v_cvt_f16_f32_e32 v24, v25
+; GCN-NEXT: v_cvt_f16_f32_e32 v25, v26
+; GCN-NEXT: v_cvt_f16_f32_e32 v26, v27
+; GCN-NEXT: v_cvt_f16_f32_e32 v27, v28
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v33, v31
+; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30
+; GCN-NEXT: v_cvt_f16_f32_e32 v34, v62
+; GCN-NEXT: v_cvt_f16_f32_e32 v62, v63
+; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v26
+; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v33
+; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v30
+; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v34
+; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v62
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v29
+; GCN-NEXT: .LBB122_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v58
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v46
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v59
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v47
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v60
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v56
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v61
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v57
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v42
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v54
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v43
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v55
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v44
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v40
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v45
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v41
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v50
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v38
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v51
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v39
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v52
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v48
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v53
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v49
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v35
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v31
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v36
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v37
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v34
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
+; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
+; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16
+; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16
+; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16
+; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16
+; GCN-NEXT: v_alignbit_b32 v10, v17, v18, 16
+; GCN-NEXT: v_alignbit_b32 v11, v19, v20, 16
+; GCN-NEXT: v_alignbit_b32 v12, v21, v22, 16
+; GCN-NEXT: v_alignbit_b32 v13, v23, v24, 16
+; GCN-NEXT: v_alignbit_b32 v14, v25, v26, 16
+; GCN-NEXT: v_alignbit_b32 v15, v27, v28, 16
+; GCN-NEXT: v_alignbit_b32 v16, v29, v30, 16
+; GCN-NEXT: v_alignbit_b32 v17, v31, v32, 16
+; GCN-NEXT: v_alignbit_b32 v18, v0, v33, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v32f16_to_v32bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v19, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v20, v19
+; VI-NEXT: v_mov_b32_e32 v21, v19
+; VI-NEXT: v_mov_b32_e32 v22, v19
+; VI-NEXT: v_mov_b32_e32 v23, v19
+; VI-NEXT: v_mov_b32_e32 v24, v19
+; VI-NEXT: v_mov_b32_e32 v25, v19
+; VI-NEXT: v_mov_b32_e32 v26, v19
+; VI-NEXT: v_mov_b32_e32 v27, v19
+; VI-NEXT: v_mov_b32_e32 v28, v19
+; VI-NEXT: v_mov_b32_e32 v29, v19
+; VI-NEXT: v_mov_b32_e32 v30, v19
+; VI-NEXT: v_mov_b32_e32 v31, v19
+; VI-NEXT: v_mov_b32_e32 v32, v19
+; VI-NEXT: v_mov_b32_e32 v33, v19
+; VI-NEXT: v_mov_b32_e32 v34, v19
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB122_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v34, v18
+; VI-NEXT: v_mov_b32_e32 v33, v17
+; VI-NEXT: v_mov_b32_e32 v32, v16
+; VI-NEXT: v_mov_b32_e32 v31, v15
+; VI-NEXT: v_mov_b32_e32 v30, v14
+; VI-NEXT: v_mov_b32_e32 v29, v13
+; VI-NEXT: v_mov_b32_e32 v28, v12
+; VI-NEXT: v_mov_b32_e32 v27, v11
+; VI-NEXT: v_mov_b32_e32 v26, v10
+; VI-NEXT: v_mov_b32_e32 v25, v9
+; VI-NEXT: v_mov_b32_e32 v24, v8
+; VI-NEXT: v_mov_b32_e32 v23, v7
+; VI-NEXT: v_mov_b32_e32 v22, v6
+; VI-NEXT: v_mov_b32_e32 v21, v5
+; VI-NEXT: v_mov_b32_e32 v20, v4
+; VI-NEXT: v_mov_b32_e32 v19, v3
+; VI-NEXT: .LBB122_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v32f16_to_v32bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v19, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v20, v19
+; GFX9-NEXT: v_mov_b32_e32 v21, v19
+; GFX9-NEXT: v_mov_b32_e32 v22, v19
+; GFX9-NEXT: v_mov_b32_e32 v23, v19
+; GFX9-NEXT: v_mov_b32_e32 v24, v19
+; GFX9-NEXT: v_mov_b32_e32 v25, v19
+; GFX9-NEXT: v_mov_b32_e32 v26, v19
+; GFX9-NEXT: v_mov_b32_e32 v27, v19
+; GFX9-NEXT: v_mov_b32_e32 v28, v19
+; GFX9-NEXT: v_mov_b32_e32 v29, v19
+; GFX9-NEXT: v_mov_b32_e32 v30, v19
+; GFX9-NEXT: v_mov_b32_e32 v31, v19
+; GFX9-NEXT: v_mov_b32_e32 v32, v19
+; GFX9-NEXT: v_mov_b32_e32 v33, v19
+; GFX9-NEXT: v_mov_b32_e32 v34, v19
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB122_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v34, v18
+; GFX9-NEXT: v_mov_b32_e32 v33, v17
+; GFX9-NEXT: v_mov_b32_e32 v32, v16
+; GFX9-NEXT: v_mov_b32_e32 v31, v15
+; GFX9-NEXT: v_mov_b32_e32 v30, v14
+; GFX9-NEXT: v_mov_b32_e32 v29, v13
+; GFX9-NEXT: v_mov_b32_e32 v28, v12
+; GFX9-NEXT: v_mov_b32_e32 v27, v11
+; GFX9-NEXT: v_mov_b32_e32 v26, v10
+; GFX9-NEXT: v_mov_b32_e32 v25, v9
+; GFX9-NEXT: v_mov_b32_e32 v24, v8
+; GFX9-NEXT: v_mov_b32_e32 v23, v7
+; GFX9-NEXT: v_mov_b32_e32 v22, v6
+; GFX9-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-NEXT: v_mov_b32_e32 v19, v3
+; GFX9-NEXT: .LBB122_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v32f16_to_v32bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v19, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v20, v19
+; GFX11-NEXT: v_mov_b32_e32 v21, v19
+; GFX11-NEXT: v_mov_b32_e32 v22, v19
+; GFX11-NEXT: v_mov_b32_e32 v23, v19
+; GFX11-NEXT: v_mov_b32_e32 v24, v19
+; GFX11-NEXT: v_mov_b32_e32 v25, v19
+; GFX11-NEXT: v_mov_b32_e32 v26, v19
+; GFX11-NEXT: v_mov_b32_e32 v27, v19
+; GFX11-NEXT: v_mov_b32_e32 v28, v19
+; GFX11-NEXT: v_mov_b32_e32 v29, v19
+; GFX11-NEXT: v_mov_b32_e32 v30, v19
+; GFX11-NEXT: v_mov_b32_e32 v31, v19
+; GFX11-NEXT: v_mov_b32_e32 v32, v19
+; GFX11-NEXT: v_mov_b32_e32 v33, v19
+; GFX11-NEXT: v_mov_b32_e32 v34, v19
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB122_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
+; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
+; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
+; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
+; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
+; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
+; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
+; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
+; GFX11-NEXT: .LBB122_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
+; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
+; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1882,8 +18219,304 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v16i32_to_v32bf16:
+
define void @v_bitcast_v16i32_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <16 x i32> %value) {
+; GCN-LABEL: v_bitcast_v16i32_to_v32bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_mov_b32_e32 v40, 0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v41, 0
+; GCN-NEXT: v_mov_b32_e32 v54, 0
+; GCN-NEXT: v_mov_b32_e32 v55, 0
+; GCN-NEXT: v_mov_b32_e32 v52, 0
+; GCN-NEXT: v_mov_b32_e32 v53, 0
+; GCN-NEXT: v_mov_b32_e32 v50, 0
+; GCN-NEXT: v_mov_b32_e32 v51, 0
+; GCN-NEXT: v_mov_b32_e32 v48, 0
+; GCN-NEXT: v_mov_b32_e32 v49, 0
+; GCN-NEXT: v_mov_b32_e32 v38, 0
+; GCN-NEXT: v_mov_b32_e32 v39, 0
+; GCN-NEXT: v_mov_b32_e32 v36, 0
+; GCN-NEXT: v_mov_b32_e32 v37, 0
+; GCN-NEXT: v_mov_b32_e32 v34, 0
+; GCN-NEXT: v_mov_b32_e32 v35, 0
+; GCN-NEXT: v_mov_b32_e32 v32, 0
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: v_mov_b32_e32 v30, 0
+; GCN-NEXT: v_mov_b32_e32 v31, 0
+; GCN-NEXT: v_mov_b32_e32 v28, 0
+; GCN-NEXT: v_mov_b32_e32 v29, 0
+; GCN-NEXT: v_mov_b32_e32 v26, 0
+; GCN-NEXT: v_mov_b32_e32 v27, 0
+; GCN-NEXT: v_mov_b32_e32 v24, 0
+; GCN-NEXT: v_mov_b32_e32 v25, 0
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: v_mov_b32_e32 v23, 0
+; GCN-NEXT: v_mov_b32_e32 v20, 0
+; GCN-NEXT: v_mov_b32_e32 v21, 0
+; GCN-NEXT: v_mov_b32_e32 v19, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB123_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v17
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v16
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v15
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v14
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v13
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v12
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v10
+; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v9
+; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v8
+; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7
+; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v6
+; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v5
+; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v4
+; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v3
+; GCN-NEXT: .LBB123_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v41
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v40
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v55
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v54
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v53
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v52
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v51
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v50
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v49
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v48
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v39
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v38
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v37
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v36
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v35
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v34
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
+; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
+; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16
+; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16
+; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16
+; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16
+; GCN-NEXT: v_alignbit_b32 v10, v17, v18, 16
+; GCN-NEXT: v_alignbit_b32 v11, v33, v32, 16
+; GCN-NEXT: v_alignbit_b32 v12, v31, v30, 16
+; GCN-NEXT: v_alignbit_b32 v13, v29, v28, 16
+; GCN-NEXT: v_alignbit_b32 v14, v27, v26, 16
+; GCN-NEXT: v_alignbit_b32 v15, v25, v24, 16
+; GCN-NEXT: v_alignbit_b32 v16, v23, v22, 16
+; GCN-NEXT: v_alignbit_b32 v17, v21, v20, 16
+; GCN-NEXT: v_alignbit_b32 v18, v0, v19, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v16i32_to_v32bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v19, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v20, v19
+; VI-NEXT: v_mov_b32_e32 v21, v19
+; VI-NEXT: v_mov_b32_e32 v22, v19
+; VI-NEXT: v_mov_b32_e32 v23, v19
+; VI-NEXT: v_mov_b32_e32 v24, v19
+; VI-NEXT: v_mov_b32_e32 v25, v19
+; VI-NEXT: v_mov_b32_e32 v26, v19
+; VI-NEXT: v_mov_b32_e32 v27, v19
+; VI-NEXT: v_mov_b32_e32 v28, v19
+; VI-NEXT: v_mov_b32_e32 v29, v19
+; VI-NEXT: v_mov_b32_e32 v30, v19
+; VI-NEXT: v_mov_b32_e32 v31, v19
+; VI-NEXT: v_mov_b32_e32 v32, v19
+; VI-NEXT: v_mov_b32_e32 v33, v19
+; VI-NEXT: v_mov_b32_e32 v34, v19
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB123_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v34, v18
+; VI-NEXT: v_mov_b32_e32 v33, v17
+; VI-NEXT: v_mov_b32_e32 v32, v16
+; VI-NEXT: v_mov_b32_e32 v31, v15
+; VI-NEXT: v_mov_b32_e32 v30, v14
+; VI-NEXT: v_mov_b32_e32 v29, v13
+; VI-NEXT: v_mov_b32_e32 v28, v12
+; VI-NEXT: v_mov_b32_e32 v27, v11
+; VI-NEXT: v_mov_b32_e32 v26, v10
+; VI-NEXT: v_mov_b32_e32 v25, v9
+; VI-NEXT: v_mov_b32_e32 v24, v8
+; VI-NEXT: v_mov_b32_e32 v23, v7
+; VI-NEXT: v_mov_b32_e32 v22, v6
+; VI-NEXT: v_mov_b32_e32 v21, v5
+; VI-NEXT: v_mov_b32_e32 v20, v4
+; VI-NEXT: v_mov_b32_e32 v19, v3
+; VI-NEXT: .LBB123_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v16i32_to_v32bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v19, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v20, v19
+; GFX9-NEXT: v_mov_b32_e32 v21, v19
+; GFX9-NEXT: v_mov_b32_e32 v22, v19
+; GFX9-NEXT: v_mov_b32_e32 v23, v19
+; GFX9-NEXT: v_mov_b32_e32 v24, v19
+; GFX9-NEXT: v_mov_b32_e32 v25, v19
+; GFX9-NEXT: v_mov_b32_e32 v26, v19
+; GFX9-NEXT: v_mov_b32_e32 v27, v19
+; GFX9-NEXT: v_mov_b32_e32 v28, v19
+; GFX9-NEXT: v_mov_b32_e32 v29, v19
+; GFX9-NEXT: v_mov_b32_e32 v30, v19
+; GFX9-NEXT: v_mov_b32_e32 v31, v19
+; GFX9-NEXT: v_mov_b32_e32 v32, v19
+; GFX9-NEXT: v_mov_b32_e32 v33, v19
+; GFX9-NEXT: v_mov_b32_e32 v34, v19
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB123_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v34, v18
+; GFX9-NEXT: v_mov_b32_e32 v33, v17
+; GFX9-NEXT: v_mov_b32_e32 v32, v16
+; GFX9-NEXT: v_mov_b32_e32 v31, v15
+; GFX9-NEXT: v_mov_b32_e32 v30, v14
+; GFX9-NEXT: v_mov_b32_e32 v29, v13
+; GFX9-NEXT: v_mov_b32_e32 v28, v12
+; GFX9-NEXT: v_mov_b32_e32 v27, v11
+; GFX9-NEXT: v_mov_b32_e32 v26, v10
+; GFX9-NEXT: v_mov_b32_e32 v25, v9
+; GFX9-NEXT: v_mov_b32_e32 v24, v8
+; GFX9-NEXT: v_mov_b32_e32 v23, v7
+; GFX9-NEXT: v_mov_b32_e32 v22, v6
+; GFX9-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-NEXT: v_mov_b32_e32 v19, v3
+; GFX9-NEXT: .LBB123_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v16i32_to_v32bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v19, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v20, v19
+; GFX11-NEXT: v_mov_b32_e32 v21, v19
+; GFX11-NEXT: v_mov_b32_e32 v22, v19
+; GFX11-NEXT: v_mov_b32_e32 v23, v19
+; GFX11-NEXT: v_mov_b32_e32 v24, v19
+; GFX11-NEXT: v_mov_b32_e32 v25, v19
+; GFX11-NEXT: v_mov_b32_e32 v26, v19
+; GFX11-NEXT: v_mov_b32_e32 v27, v19
+; GFX11-NEXT: v_mov_b32_e32 v28, v19
+; GFX11-NEXT: v_mov_b32_e32 v29, v19
+; GFX11-NEXT: v_mov_b32_e32 v30, v19
+; GFX11-NEXT: v_mov_b32_e32 v31, v19
+; GFX11-NEXT: v_mov_b32_e32 v32, v19
+; GFX11-NEXT: v_mov_b32_e32 v33, v19
+; GFX11-NEXT: v_mov_b32_e32 v34, v19
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB123_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
+; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
+; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
+; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
+; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
+; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
+; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
+; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
+; GFX11-NEXT: .LBB123_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
+; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
+; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1898,8 +18531,304 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v16f32_to_v32bf16:
+
define void @v_bitcast_v16f32_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <16 x float> %value) {
+; GCN-LABEL: v_bitcast_v16f32_to_v32bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_mov_b32_e32 v40, 0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v41, 0
+; GCN-NEXT: v_mov_b32_e32 v54, 0
+; GCN-NEXT: v_mov_b32_e32 v55, 0
+; GCN-NEXT: v_mov_b32_e32 v52, 0
+; GCN-NEXT: v_mov_b32_e32 v53, 0
+; GCN-NEXT: v_mov_b32_e32 v50, 0
+; GCN-NEXT: v_mov_b32_e32 v51, 0
+; GCN-NEXT: v_mov_b32_e32 v48, 0
+; GCN-NEXT: v_mov_b32_e32 v49, 0
+; GCN-NEXT: v_mov_b32_e32 v38, 0
+; GCN-NEXT: v_mov_b32_e32 v39, 0
+; GCN-NEXT: v_mov_b32_e32 v36, 0
+; GCN-NEXT: v_mov_b32_e32 v37, 0
+; GCN-NEXT: v_mov_b32_e32 v34, 0
+; GCN-NEXT: v_mov_b32_e32 v35, 0
+; GCN-NEXT: v_mov_b32_e32 v32, 0
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: v_mov_b32_e32 v30, 0
+; GCN-NEXT: v_mov_b32_e32 v31, 0
+; GCN-NEXT: v_mov_b32_e32 v28, 0
+; GCN-NEXT: v_mov_b32_e32 v29, 0
+; GCN-NEXT: v_mov_b32_e32 v26, 0
+; GCN-NEXT: v_mov_b32_e32 v27, 0
+; GCN-NEXT: v_mov_b32_e32 v24, 0
+; GCN-NEXT: v_mov_b32_e32 v25, 0
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: v_mov_b32_e32 v23, 0
+; GCN-NEXT: v_mov_b32_e32 v20, 0
+; GCN-NEXT: v_mov_b32_e32 v21, 0
+; GCN-NEXT: v_mov_b32_e32 v19, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB124_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v17
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v16
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v15
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v14
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v13
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v12
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v10
+; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v9
+; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v8
+; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7
+; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v6
+; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v5
+; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v4
+; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v3
+; GCN-NEXT: .LBB124_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v41
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v40
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v55
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v54
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v53
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v52
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v51
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v50
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v49
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v48
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v39
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v38
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v37
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v36
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v35
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v34
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
+; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
+; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16
+; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16
+; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16
+; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16
+; GCN-NEXT: v_alignbit_b32 v10, v17, v18, 16
+; GCN-NEXT: v_alignbit_b32 v11, v33, v32, 16
+; GCN-NEXT: v_alignbit_b32 v12, v31, v30, 16
+; GCN-NEXT: v_alignbit_b32 v13, v29, v28, 16
+; GCN-NEXT: v_alignbit_b32 v14, v27, v26, 16
+; GCN-NEXT: v_alignbit_b32 v15, v25, v24, 16
+; GCN-NEXT: v_alignbit_b32 v16, v23, v22, 16
+; GCN-NEXT: v_alignbit_b32 v17, v21, v20, 16
+; GCN-NEXT: v_alignbit_b32 v18, v0, v19, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v16f32_to_v32bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v19, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v20, v19
+; VI-NEXT: v_mov_b32_e32 v21, v19
+; VI-NEXT: v_mov_b32_e32 v22, v19
+; VI-NEXT: v_mov_b32_e32 v23, v19
+; VI-NEXT: v_mov_b32_e32 v24, v19
+; VI-NEXT: v_mov_b32_e32 v25, v19
+; VI-NEXT: v_mov_b32_e32 v26, v19
+; VI-NEXT: v_mov_b32_e32 v27, v19
+; VI-NEXT: v_mov_b32_e32 v28, v19
+; VI-NEXT: v_mov_b32_e32 v29, v19
+; VI-NEXT: v_mov_b32_e32 v30, v19
+; VI-NEXT: v_mov_b32_e32 v31, v19
+; VI-NEXT: v_mov_b32_e32 v32, v19
+; VI-NEXT: v_mov_b32_e32 v33, v19
+; VI-NEXT: v_mov_b32_e32 v34, v19
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB124_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v34, v18
+; VI-NEXT: v_mov_b32_e32 v33, v17
+; VI-NEXT: v_mov_b32_e32 v32, v16
+; VI-NEXT: v_mov_b32_e32 v31, v15
+; VI-NEXT: v_mov_b32_e32 v30, v14
+; VI-NEXT: v_mov_b32_e32 v29, v13
+; VI-NEXT: v_mov_b32_e32 v28, v12
+; VI-NEXT: v_mov_b32_e32 v27, v11
+; VI-NEXT: v_mov_b32_e32 v26, v10
+; VI-NEXT: v_mov_b32_e32 v25, v9
+; VI-NEXT: v_mov_b32_e32 v24, v8
+; VI-NEXT: v_mov_b32_e32 v23, v7
+; VI-NEXT: v_mov_b32_e32 v22, v6
+; VI-NEXT: v_mov_b32_e32 v21, v5
+; VI-NEXT: v_mov_b32_e32 v20, v4
+; VI-NEXT: v_mov_b32_e32 v19, v3
+; VI-NEXT: .LBB124_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v16f32_to_v32bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v19, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v20, v19
+; GFX9-NEXT: v_mov_b32_e32 v21, v19
+; GFX9-NEXT: v_mov_b32_e32 v22, v19
+; GFX9-NEXT: v_mov_b32_e32 v23, v19
+; GFX9-NEXT: v_mov_b32_e32 v24, v19
+; GFX9-NEXT: v_mov_b32_e32 v25, v19
+; GFX9-NEXT: v_mov_b32_e32 v26, v19
+; GFX9-NEXT: v_mov_b32_e32 v27, v19
+; GFX9-NEXT: v_mov_b32_e32 v28, v19
+; GFX9-NEXT: v_mov_b32_e32 v29, v19
+; GFX9-NEXT: v_mov_b32_e32 v30, v19
+; GFX9-NEXT: v_mov_b32_e32 v31, v19
+; GFX9-NEXT: v_mov_b32_e32 v32, v19
+; GFX9-NEXT: v_mov_b32_e32 v33, v19
+; GFX9-NEXT: v_mov_b32_e32 v34, v19
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB124_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v34, v18
+; GFX9-NEXT: v_mov_b32_e32 v33, v17
+; GFX9-NEXT: v_mov_b32_e32 v32, v16
+; GFX9-NEXT: v_mov_b32_e32 v31, v15
+; GFX9-NEXT: v_mov_b32_e32 v30, v14
+; GFX9-NEXT: v_mov_b32_e32 v29, v13
+; GFX9-NEXT: v_mov_b32_e32 v28, v12
+; GFX9-NEXT: v_mov_b32_e32 v27, v11
+; GFX9-NEXT: v_mov_b32_e32 v26, v10
+; GFX9-NEXT: v_mov_b32_e32 v25, v9
+; GFX9-NEXT: v_mov_b32_e32 v24, v8
+; GFX9-NEXT: v_mov_b32_e32 v23, v7
+; GFX9-NEXT: v_mov_b32_e32 v22, v6
+; GFX9-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-NEXT: v_mov_b32_e32 v19, v3
+; GFX9-NEXT: .LBB124_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v16f32_to_v32bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v19, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v20, v19
+; GFX11-NEXT: v_mov_b32_e32 v21, v19
+; GFX11-NEXT: v_mov_b32_e32 v22, v19
+; GFX11-NEXT: v_mov_b32_e32 v23, v19
+; GFX11-NEXT: v_mov_b32_e32 v24, v19
+; GFX11-NEXT: v_mov_b32_e32 v25, v19
+; GFX11-NEXT: v_mov_b32_e32 v26, v19
+; GFX11-NEXT: v_mov_b32_e32 v27, v19
+; GFX11-NEXT: v_mov_b32_e32 v28, v19
+; GFX11-NEXT: v_mov_b32_e32 v29, v19
+; GFX11-NEXT: v_mov_b32_e32 v30, v19
+; GFX11-NEXT: v_mov_b32_e32 v31, v19
+; GFX11-NEXT: v_mov_b32_e32 v32, v19
+; GFX11-NEXT: v_mov_b32_e32 v33, v19
+; GFX11-NEXT: v_mov_b32_e32 v34, v19
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB124_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
+; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
+; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
+; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
+; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
+; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
+; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
+; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
+; GFX11-NEXT: .LBB124_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
+; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
+; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1914,8 +18843,304 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v8f64_to_v32bf16:
+
define void @v_bitcast_v8f64_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <8 x double> %value) {
+; GCN-LABEL: v_bitcast_v8f64_to_v32bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_mov_b32_e32 v40, 0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v41, 0
+; GCN-NEXT: v_mov_b32_e32 v54, 0
+; GCN-NEXT: v_mov_b32_e32 v55, 0
+; GCN-NEXT: v_mov_b32_e32 v52, 0
+; GCN-NEXT: v_mov_b32_e32 v53, 0
+; GCN-NEXT: v_mov_b32_e32 v50, 0
+; GCN-NEXT: v_mov_b32_e32 v51, 0
+; GCN-NEXT: v_mov_b32_e32 v48, 0
+; GCN-NEXT: v_mov_b32_e32 v49, 0
+; GCN-NEXT: v_mov_b32_e32 v38, 0
+; GCN-NEXT: v_mov_b32_e32 v39, 0
+; GCN-NEXT: v_mov_b32_e32 v36, 0
+; GCN-NEXT: v_mov_b32_e32 v37, 0
+; GCN-NEXT: v_mov_b32_e32 v34, 0
+; GCN-NEXT: v_mov_b32_e32 v35, 0
+; GCN-NEXT: v_mov_b32_e32 v32, 0
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: v_mov_b32_e32 v30, 0
+; GCN-NEXT: v_mov_b32_e32 v31, 0
+; GCN-NEXT: v_mov_b32_e32 v28, 0
+; GCN-NEXT: v_mov_b32_e32 v29, 0
+; GCN-NEXT: v_mov_b32_e32 v26, 0
+; GCN-NEXT: v_mov_b32_e32 v27, 0
+; GCN-NEXT: v_mov_b32_e32 v24, 0
+; GCN-NEXT: v_mov_b32_e32 v25, 0
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: v_mov_b32_e32 v23, 0
+; GCN-NEXT: v_mov_b32_e32 v20, 0
+; GCN-NEXT: v_mov_b32_e32 v21, 0
+; GCN-NEXT: v_mov_b32_e32 v19, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB125_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v17
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v16
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v15
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v14
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v13
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v12
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v10
+; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v9
+; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v8
+; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7
+; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v6
+; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v5
+; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v4
+; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v3
+; GCN-NEXT: .LBB125_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v41
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v40
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v55
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v54
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v53
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v52
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v51
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v50
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v49
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v48
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v39
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v38
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v37
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v36
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v35
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v34
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
+; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
+; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16
+; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16
+; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16
+; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16
+; GCN-NEXT: v_alignbit_b32 v10, v17, v18, 16
+; GCN-NEXT: v_alignbit_b32 v11, v33, v32, 16
+; GCN-NEXT: v_alignbit_b32 v12, v31, v30, 16
+; GCN-NEXT: v_alignbit_b32 v13, v29, v28, 16
+; GCN-NEXT: v_alignbit_b32 v14, v27, v26, 16
+; GCN-NEXT: v_alignbit_b32 v15, v25, v24, 16
+; GCN-NEXT: v_alignbit_b32 v16, v23, v22, 16
+; GCN-NEXT: v_alignbit_b32 v17, v21, v20, 16
+; GCN-NEXT: v_alignbit_b32 v18, v0, v19, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v8f64_to_v32bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v19, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v20, v19
+; VI-NEXT: v_mov_b32_e32 v21, v19
+; VI-NEXT: v_mov_b32_e32 v22, v19
+; VI-NEXT: v_mov_b32_e32 v23, v19
+; VI-NEXT: v_mov_b32_e32 v24, v19
+; VI-NEXT: v_mov_b32_e32 v25, v19
+; VI-NEXT: v_mov_b32_e32 v26, v19
+; VI-NEXT: v_mov_b32_e32 v27, v19
+; VI-NEXT: v_mov_b32_e32 v28, v19
+; VI-NEXT: v_mov_b32_e32 v29, v19
+; VI-NEXT: v_mov_b32_e32 v30, v19
+; VI-NEXT: v_mov_b32_e32 v31, v19
+; VI-NEXT: v_mov_b32_e32 v32, v19
+; VI-NEXT: v_mov_b32_e32 v33, v19
+; VI-NEXT: v_mov_b32_e32 v34, v19
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB125_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v34, v18
+; VI-NEXT: v_mov_b32_e32 v33, v17
+; VI-NEXT: v_mov_b32_e32 v32, v16
+; VI-NEXT: v_mov_b32_e32 v31, v15
+; VI-NEXT: v_mov_b32_e32 v30, v14
+; VI-NEXT: v_mov_b32_e32 v29, v13
+; VI-NEXT: v_mov_b32_e32 v28, v12
+; VI-NEXT: v_mov_b32_e32 v27, v11
+; VI-NEXT: v_mov_b32_e32 v26, v10
+; VI-NEXT: v_mov_b32_e32 v25, v9
+; VI-NEXT: v_mov_b32_e32 v24, v8
+; VI-NEXT: v_mov_b32_e32 v23, v7
+; VI-NEXT: v_mov_b32_e32 v22, v6
+; VI-NEXT: v_mov_b32_e32 v21, v5
+; VI-NEXT: v_mov_b32_e32 v20, v4
+; VI-NEXT: v_mov_b32_e32 v19, v3
+; VI-NEXT: .LBB125_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v8f64_to_v32bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v19, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v20, v19
+; GFX9-NEXT: v_mov_b32_e32 v21, v19
+; GFX9-NEXT: v_mov_b32_e32 v22, v19
+; GFX9-NEXT: v_mov_b32_e32 v23, v19
+; GFX9-NEXT: v_mov_b32_e32 v24, v19
+; GFX9-NEXT: v_mov_b32_e32 v25, v19
+; GFX9-NEXT: v_mov_b32_e32 v26, v19
+; GFX9-NEXT: v_mov_b32_e32 v27, v19
+; GFX9-NEXT: v_mov_b32_e32 v28, v19
+; GFX9-NEXT: v_mov_b32_e32 v29, v19
+; GFX9-NEXT: v_mov_b32_e32 v30, v19
+; GFX9-NEXT: v_mov_b32_e32 v31, v19
+; GFX9-NEXT: v_mov_b32_e32 v32, v19
+; GFX9-NEXT: v_mov_b32_e32 v33, v19
+; GFX9-NEXT: v_mov_b32_e32 v34, v19
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB125_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v34, v18
+; GFX9-NEXT: v_mov_b32_e32 v33, v17
+; GFX9-NEXT: v_mov_b32_e32 v32, v16
+; GFX9-NEXT: v_mov_b32_e32 v31, v15
+; GFX9-NEXT: v_mov_b32_e32 v30, v14
+; GFX9-NEXT: v_mov_b32_e32 v29, v13
+; GFX9-NEXT: v_mov_b32_e32 v28, v12
+; GFX9-NEXT: v_mov_b32_e32 v27, v11
+; GFX9-NEXT: v_mov_b32_e32 v26, v10
+; GFX9-NEXT: v_mov_b32_e32 v25, v9
+; GFX9-NEXT: v_mov_b32_e32 v24, v8
+; GFX9-NEXT: v_mov_b32_e32 v23, v7
+; GFX9-NEXT: v_mov_b32_e32 v22, v6
+; GFX9-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-NEXT: v_mov_b32_e32 v19, v3
+; GFX9-NEXT: .LBB125_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v8f64_to_v32bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v19, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v20, v19
+; GFX11-NEXT: v_mov_b32_e32 v21, v19
+; GFX11-NEXT: v_mov_b32_e32 v22, v19
+; GFX11-NEXT: v_mov_b32_e32 v23, v19
+; GFX11-NEXT: v_mov_b32_e32 v24, v19
+; GFX11-NEXT: v_mov_b32_e32 v25, v19
+; GFX11-NEXT: v_mov_b32_e32 v26, v19
+; GFX11-NEXT: v_mov_b32_e32 v27, v19
+; GFX11-NEXT: v_mov_b32_e32 v28, v19
+; GFX11-NEXT: v_mov_b32_e32 v29, v19
+; GFX11-NEXT: v_mov_b32_e32 v30, v19
+; GFX11-NEXT: v_mov_b32_e32 v31, v19
+; GFX11-NEXT: v_mov_b32_e32 v32, v19
+; GFX11-NEXT: v_mov_b32_e32 v33, v19
+; GFX11-NEXT: v_mov_b32_e32 v34, v19
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB125_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
+; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
+; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
+; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
+; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
+; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
+; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
+; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
+; GFX11-NEXT: .LBB125_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
+; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
+; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1930,8 +19155,304 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v8i64_to_v32bf16:
+
define void @v_bitcast_v8i64_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <8 x i64> %value) {
+; GCN-LABEL: v_bitcast_v8i64_to_v32bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_mov_b32_e32 v40, 0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v41, 0
+; GCN-NEXT: v_mov_b32_e32 v54, 0
+; GCN-NEXT: v_mov_b32_e32 v55, 0
+; GCN-NEXT: v_mov_b32_e32 v52, 0
+; GCN-NEXT: v_mov_b32_e32 v53, 0
+; GCN-NEXT: v_mov_b32_e32 v50, 0
+; GCN-NEXT: v_mov_b32_e32 v51, 0
+; GCN-NEXT: v_mov_b32_e32 v48, 0
+; GCN-NEXT: v_mov_b32_e32 v49, 0
+; GCN-NEXT: v_mov_b32_e32 v38, 0
+; GCN-NEXT: v_mov_b32_e32 v39, 0
+; GCN-NEXT: v_mov_b32_e32 v36, 0
+; GCN-NEXT: v_mov_b32_e32 v37, 0
+; GCN-NEXT: v_mov_b32_e32 v34, 0
+; GCN-NEXT: v_mov_b32_e32 v35, 0
+; GCN-NEXT: v_mov_b32_e32 v32, 0
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: v_mov_b32_e32 v30, 0
+; GCN-NEXT: v_mov_b32_e32 v31, 0
+; GCN-NEXT: v_mov_b32_e32 v28, 0
+; GCN-NEXT: v_mov_b32_e32 v29, 0
+; GCN-NEXT: v_mov_b32_e32 v26, 0
+; GCN-NEXT: v_mov_b32_e32 v27, 0
+; GCN-NEXT: v_mov_b32_e32 v24, 0
+; GCN-NEXT: v_mov_b32_e32 v25, 0
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: v_mov_b32_e32 v23, 0
+; GCN-NEXT: v_mov_b32_e32 v20, 0
+; GCN-NEXT: v_mov_b32_e32 v21, 0
+; GCN-NEXT: v_mov_b32_e32 v19, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB126_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v17
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v16
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v15
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v14
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v13
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v12
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v10
+; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v9
+; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v8
+; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7
+; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v6
+; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v5
+; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v4
+; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v3
+; GCN-NEXT: .LBB126_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v41
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v40
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v55
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v54
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v53
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v52
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v51
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v50
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v49
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v48
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v39
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v38
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v37
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v36
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v35
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v34
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
+; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
+; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16
+; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16
+; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16
+; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16
+; GCN-NEXT: v_alignbit_b32 v10, v17, v18, 16
+; GCN-NEXT: v_alignbit_b32 v11, v33, v32, 16
+; GCN-NEXT: v_alignbit_b32 v12, v31, v30, 16
+; GCN-NEXT: v_alignbit_b32 v13, v29, v28, 16
+; GCN-NEXT: v_alignbit_b32 v14, v27, v26, 16
+; GCN-NEXT: v_alignbit_b32 v15, v25, v24, 16
+; GCN-NEXT: v_alignbit_b32 v16, v23, v22, 16
+; GCN-NEXT: v_alignbit_b32 v17, v21, v20, 16
+; GCN-NEXT: v_alignbit_b32 v18, v0, v19, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v8i64_to_v32bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v19, 0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v20, v19
+; VI-NEXT: v_mov_b32_e32 v21, v19
+; VI-NEXT: v_mov_b32_e32 v22, v19
+; VI-NEXT: v_mov_b32_e32 v23, v19
+; VI-NEXT: v_mov_b32_e32 v24, v19
+; VI-NEXT: v_mov_b32_e32 v25, v19
+; VI-NEXT: v_mov_b32_e32 v26, v19
+; VI-NEXT: v_mov_b32_e32 v27, v19
+; VI-NEXT: v_mov_b32_e32 v28, v19
+; VI-NEXT: v_mov_b32_e32 v29, v19
+; VI-NEXT: v_mov_b32_e32 v30, v19
+; VI-NEXT: v_mov_b32_e32 v31, v19
+; VI-NEXT: v_mov_b32_e32 v32, v19
+; VI-NEXT: v_mov_b32_e32 v33, v19
+; VI-NEXT: v_mov_b32_e32 v34, v19
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB126_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v34, v18
+; VI-NEXT: v_mov_b32_e32 v33, v17
+; VI-NEXT: v_mov_b32_e32 v32, v16
+; VI-NEXT: v_mov_b32_e32 v31, v15
+; VI-NEXT: v_mov_b32_e32 v30, v14
+; VI-NEXT: v_mov_b32_e32 v29, v13
+; VI-NEXT: v_mov_b32_e32 v28, v12
+; VI-NEXT: v_mov_b32_e32 v27, v11
+; VI-NEXT: v_mov_b32_e32 v26, v10
+; VI-NEXT: v_mov_b32_e32 v25, v9
+; VI-NEXT: v_mov_b32_e32 v24, v8
+; VI-NEXT: v_mov_b32_e32 v23, v7
+; VI-NEXT: v_mov_b32_e32 v22, v6
+; VI-NEXT: v_mov_b32_e32 v21, v5
+; VI-NEXT: v_mov_b32_e32 v20, v4
+; VI-NEXT: v_mov_b32_e32 v19, v3
+; VI-NEXT: .LBB126_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v8i64_to_v32bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v19, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v20, v19
+; GFX9-NEXT: v_mov_b32_e32 v21, v19
+; GFX9-NEXT: v_mov_b32_e32 v22, v19
+; GFX9-NEXT: v_mov_b32_e32 v23, v19
+; GFX9-NEXT: v_mov_b32_e32 v24, v19
+; GFX9-NEXT: v_mov_b32_e32 v25, v19
+; GFX9-NEXT: v_mov_b32_e32 v26, v19
+; GFX9-NEXT: v_mov_b32_e32 v27, v19
+; GFX9-NEXT: v_mov_b32_e32 v28, v19
+; GFX9-NEXT: v_mov_b32_e32 v29, v19
+; GFX9-NEXT: v_mov_b32_e32 v30, v19
+; GFX9-NEXT: v_mov_b32_e32 v31, v19
+; GFX9-NEXT: v_mov_b32_e32 v32, v19
+; GFX9-NEXT: v_mov_b32_e32 v33, v19
+; GFX9-NEXT: v_mov_b32_e32 v34, v19
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB126_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v34, v18
+; GFX9-NEXT: v_mov_b32_e32 v33, v17
+; GFX9-NEXT: v_mov_b32_e32 v32, v16
+; GFX9-NEXT: v_mov_b32_e32 v31, v15
+; GFX9-NEXT: v_mov_b32_e32 v30, v14
+; GFX9-NEXT: v_mov_b32_e32 v29, v13
+; GFX9-NEXT: v_mov_b32_e32 v28, v12
+; GFX9-NEXT: v_mov_b32_e32 v27, v11
+; GFX9-NEXT: v_mov_b32_e32 v26, v10
+; GFX9-NEXT: v_mov_b32_e32 v25, v9
+; GFX9-NEXT: v_mov_b32_e32 v24, v8
+; GFX9-NEXT: v_mov_b32_e32 v23, v7
+; GFX9-NEXT: v_mov_b32_e32 v22, v6
+; GFX9-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-NEXT: v_mov_b32_e32 v19, v3
+; GFX9-NEXT: .LBB126_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v8i64_to_v32bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v19, 0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v20, v19
+; GFX11-NEXT: v_mov_b32_e32 v21, v19
+; GFX11-NEXT: v_mov_b32_e32 v22, v19
+; GFX11-NEXT: v_mov_b32_e32 v23, v19
+; GFX11-NEXT: v_mov_b32_e32 v24, v19
+; GFX11-NEXT: v_mov_b32_e32 v25, v19
+; GFX11-NEXT: v_mov_b32_e32 v26, v19
+; GFX11-NEXT: v_mov_b32_e32 v27, v19
+; GFX11-NEXT: v_mov_b32_e32 v28, v19
+; GFX11-NEXT: v_mov_b32_e32 v29, v19
+; GFX11-NEXT: v_mov_b32_e32 v30, v19
+; GFX11-NEXT: v_mov_b32_e32 v31, v19
+; GFX11-NEXT: v_mov_b32_e32 v32, v19
+; GFX11-NEXT: v_mov_b32_e32 v33, v19
+; GFX11-NEXT: v_mov_b32_e32 v34, v19
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB126_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
+; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
+; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
+; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
+; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
+; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
+; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
+; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
+; GFX11-NEXT: .LBB126_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
+; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
+; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1953,8 +19474,943 @@ end:
-; CHECK-LABEL: {{^}}v_bitcast_v32f32_to_v64bf16:
+
define void @v_bitcast_v32f32_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <32 x float> %value) {
+; GCN-LABEL: v_bitcast_v32f32_to_v64bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v58, 0
+; GCN-NEXT: v_mov_b32_e32 v59, 0
+; GCN-NEXT: v_mov_b32_e32 v56, 0
+; GCN-NEXT: v_mov_b32_e32 v57, 0
+; GCN-NEXT: v_mov_b32_e32 v46, 0
+; GCN-NEXT: v_mov_b32_e32 v47, 0
+; GCN-NEXT: v_mov_b32_e32 v44, 0
+; GCN-NEXT: v_mov_b32_e32 v45, 0
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: v_mov_b32_e32 v43, 0
+; GCN-NEXT: v_mov_b32_e32 v40, 0
+; GCN-NEXT: v_mov_b32_e32 v41, 0
+; GCN-NEXT: v_mov_b32_e32 v54, 0
+; GCN-NEXT: v_mov_b32_e32 v55, 0
+; GCN-NEXT: v_mov_b32_e32 v52, 0
+; GCN-NEXT: v_mov_b32_e32 v53, 0
+; GCN-NEXT: v_mov_b32_e32 v50, 0
+; GCN-NEXT: v_mov_b32_e32 v51, 0
+; GCN-NEXT: v_mov_b32_e32 v48, 0
+; GCN-NEXT: v_mov_b32_e32 v49, 0
+; GCN-NEXT: v_mov_b32_e32 v38, 0
+; GCN-NEXT: v_mov_b32_e32 v39, 0
+; GCN-NEXT: v_mov_b32_e32 v36, 0
+; GCN-NEXT: v_mov_b32_e32 v37, 0
+; GCN-NEXT: v_mov_b32_e32 v34, 0
+; GCN-NEXT: v_mov_b32_e32 v35, 0
+; GCN-NEXT: v_mov_b32_e32 v32, 0
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: v_mov_b32_e32 v31, 0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB127_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: s_waitcnt vmcnt(14)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v63
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v63
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v62
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v62
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v61
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v61
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v60
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v60
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v30
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v30
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v29
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v29
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v28
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v28
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v27
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v27
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v26
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v25
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v25
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v24
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v24
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v23
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v23
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v22
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v22
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v21
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v21
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v20
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v20
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v19
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v19
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v18
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v18
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v17
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v17
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v16
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v15
+; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v14
+; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v13
+; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v12
+; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v11
+; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v10
+; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v9
+; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v8
+; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v7
+; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v6
+; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v5
+; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v4
+; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v3
+; GCN-NEXT: .LBB127_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v59
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v58
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v57
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v56
+; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v47
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v46
+; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v45
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v44
+; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v43
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v42
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v41
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v40
+; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v55
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v54
+; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v53
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v52
+; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v51
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v50
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v49
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v48
+; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v39
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v38
+; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v37
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v36
+; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v35
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v34
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v32
+; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v32f32_to_v64bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: s_mov_b32 s4, 0
+; VI-NEXT: s_mov_b32 s19, s4
+; VI-NEXT: s_mov_b32 s5, s4
+; VI-NEXT: s_mov_b32 s6, s4
+; VI-NEXT: s_mov_b32 s7, s4
+; VI-NEXT: s_mov_b32 s8, s4
+; VI-NEXT: s_mov_b32 s9, s4
+; VI-NEXT: s_mov_b32 s10, s4
+; VI-NEXT: s_mov_b32 s11, s4
+; VI-NEXT: s_mov_b32 s12, s4
+; VI-NEXT: s_mov_b32 s13, s4
+; VI-NEXT: s_mov_b32 s14, s4
+; VI-NEXT: s_mov_b32 s15, s4
+; VI-NEXT: s_mov_b32 s16, s4
+; VI-NEXT: s_mov_b32 s17, s4
+; VI-NEXT: s_mov_b32 s18, s4
+; VI-NEXT: v_mov_b32_e32 v50, s19
+; VI-NEXT: v_mov_b32_e32 v49, s18
+; VI-NEXT: v_mov_b32_e32 v48, s17
+; VI-NEXT: v_mov_b32_e32 v47, s16
+; VI-NEXT: v_mov_b32_e32 v46, s15
+; VI-NEXT: v_mov_b32_e32 v45, s14
+; VI-NEXT: v_mov_b32_e32 v44, s13
+; VI-NEXT: v_mov_b32_e32 v43, s12
+; VI-NEXT: v_mov_b32_e32 v42, s11
+; VI-NEXT: v_mov_b32_e32 v41, s10
+; VI-NEXT: v_mov_b32_e32 v40, s9
+; VI-NEXT: v_mov_b32_e32 v39, s8
+; VI-NEXT: v_mov_b32_e32 v38, s7
+; VI-NEXT: v_mov_b32_e32 v37, s6
+; VI-NEXT: v_mov_b32_e32 v36, s5
+; VI-NEXT: v_mov_b32_e32 v35, s4
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB127_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v50, v18
+; VI-NEXT: v_mov_b32_e32 v49, v17
+; VI-NEXT: v_mov_b32_e32 v48, v16
+; VI-NEXT: v_mov_b32_e32 v47, v15
+; VI-NEXT: v_mov_b32_e32 v46, v14
+; VI-NEXT: v_mov_b32_e32 v45, v13
+; VI-NEXT: v_mov_b32_e32 v44, v12
+; VI-NEXT: v_mov_b32_e32 v43, v11
+; VI-NEXT: v_mov_b32_e32 v42, v10
+; VI-NEXT: v_mov_b32_e32 v41, v9
+; VI-NEXT: v_mov_b32_e32 v40, v8
+; VI-NEXT: v_mov_b32_e32 v39, v7
+; VI-NEXT: v_mov_b32_e32 v38, v6
+; VI-NEXT: v_mov_b32_e32 v37, v5
+; VI-NEXT: v_mov_b32_e32 v36, v4
+; VI-NEXT: v_mov_b32_e32 v35, v3
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: .LBB127_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[47:50]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[35:38]
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; VI-NEXT: s_movk_i32 s4, 0x70
+; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: s_movk_i32 s4, 0x60
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[17:20]
+; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: s_movk_i32 s4, 0x50
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[13:16]
+; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 64, v1
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[9:12]
+; VI-NEXT: flat_store_dwordx4 v[0:1], v[5:8]
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v32f32_to_v64bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_mov_b32 s19, s4
+; GFX9-NEXT: s_mov_b32 s5, s4
+; GFX9-NEXT: s_mov_b32 s6, s4
+; GFX9-NEXT: s_mov_b32 s7, s4
+; GFX9-NEXT: s_mov_b32 s8, s4
+; GFX9-NEXT: s_mov_b32 s9, s4
+; GFX9-NEXT: s_mov_b32 s10, s4
+; GFX9-NEXT: s_mov_b32 s11, s4
+; GFX9-NEXT: s_mov_b32 s12, s4
+; GFX9-NEXT: s_mov_b32 s13, s4
+; GFX9-NEXT: s_mov_b32 s14, s4
+; GFX9-NEXT: s_mov_b32 s15, s4
+; GFX9-NEXT: s_mov_b32 s16, s4
+; GFX9-NEXT: s_mov_b32 s17, s4
+; GFX9-NEXT: s_mov_b32 s18, s4
+; GFX9-NEXT: v_mov_b32_e32 v50, s19
+; GFX9-NEXT: v_mov_b32_e32 v49, s18
+; GFX9-NEXT: v_mov_b32_e32 v48, s17
+; GFX9-NEXT: v_mov_b32_e32 v47, s16
+; GFX9-NEXT: v_mov_b32_e32 v46, s15
+; GFX9-NEXT: v_mov_b32_e32 v45, s14
+; GFX9-NEXT: v_mov_b32_e32 v44, s13
+; GFX9-NEXT: v_mov_b32_e32 v43, s12
+; GFX9-NEXT: v_mov_b32_e32 v42, s11
+; GFX9-NEXT: v_mov_b32_e32 v41, s10
+; GFX9-NEXT: v_mov_b32_e32 v40, s9
+; GFX9-NEXT: v_mov_b32_e32 v39, s8
+; GFX9-NEXT: v_mov_b32_e32 v38, s7
+; GFX9-NEXT: v_mov_b32_e32 v37, s6
+; GFX9-NEXT: v_mov_b32_e32 v36, s5
+; GFX9-NEXT: v_mov_b32_e32 v35, s4
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB127_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v50, v18
+; GFX9-NEXT: v_mov_b32_e32 v49, v17
+; GFX9-NEXT: v_mov_b32_e32 v48, v16
+; GFX9-NEXT: v_mov_b32_e32 v47, v15
+; GFX9-NEXT: v_mov_b32_e32 v46, v14
+; GFX9-NEXT: v_mov_b32_e32 v45, v13
+; GFX9-NEXT: v_mov_b32_e32 v44, v12
+; GFX9-NEXT: v_mov_b32_e32 v43, v11
+; GFX9-NEXT: v_mov_b32_e32 v42, v10
+; GFX9-NEXT: v_mov_b32_e32 v41, v9
+; GFX9-NEXT: v_mov_b32_e32 v40, v8
+; GFX9-NEXT: v_mov_b32_e32 v39, v7
+; GFX9-NEXT: v_mov_b32_e32 v38, v6
+; GFX9-NEXT: v_mov_b32_e32 v37, v5
+; GFX9-NEXT: v_mov_b32_e32 v36, v4
+; GFX9-NEXT: v_mov_b32_e32 v35, v3
+; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(28)
+; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: .LBB127_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:112
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:96
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:80
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off offset:64
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v32f32_to_v64bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72
+; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68
+; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64
+; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60
+; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56
+; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52
+; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48
+; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44
+; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40
+; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36
+; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32
+; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28
+; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24
+; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20
+; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
+; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s15, s0
+; GFX11-NEXT: s_mov_b32 s1, s0
+; GFX11-NEXT: s_mov_b32 s2, s0
+; GFX11-NEXT: s_mov_b32 s3, s0
+; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s5, s0
+; GFX11-NEXT: s_mov_b32 s6, s0
+; GFX11-NEXT: s_mov_b32 s7, s0
+; GFX11-NEXT: s_mov_b32 s8, s0
+; GFX11-NEXT: s_mov_b32 s9, s0
+; GFX11-NEXT: s_mov_b32 s10, s0
+; GFX11-NEXT: s_mov_b32 s11, s0
+; GFX11-NEXT: s_mov_b32 s12, s0
+; GFX11-NEXT: s_mov_b32 s13, s0
+; GFX11-NEXT: s_mov_b32 s14, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v66, s15 :: v_dual_mov_b32 v65, s14
+; GFX11-NEXT: v_dual_mov_b32 v52, s1 :: v_dual_mov_b32 v51, s0
+; GFX11-NEXT: v_dual_mov_b32 v64, s13 :: v_dual_mov_b32 v63, s12
+; GFX11-NEXT: v_dual_mov_b32 v62, s11 :: v_dual_mov_b32 v61, s10
+; GFX11-NEXT: v_dual_mov_b32 v60, s9 :: v_dual_mov_b32 v59, s8
+; GFX11-NEXT: v_dual_mov_b32 v58, s7 :: v_dual_mov_b32 v57, s6
+; GFX11-NEXT: v_dual_mov_b32 v56, s5 :: v_dual_mov_b32 v55, s4
+; GFX11-NEXT: v_dual_mov_b32 v54, s3 :: v_dual_mov_b32 v53, s2
+; GFX11-NEXT: v_dual_mov_b32 v35, v51 :: v_dual_mov_b32 v36, v52
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_mov_b32 v39, v55 :: v_dual_mov_b32 v40, v56
+; GFX11-NEXT: v_dual_mov_b32 v37, v53 :: v_dual_mov_b32 v38, v54
+; GFX11-NEXT: v_dual_mov_b32 v41, v57 :: v_dual_mov_b32 v42, v58
+; GFX11-NEXT: v_dual_mov_b32 v43, v59 :: v_dual_mov_b32 v44, v60
+; GFX11-NEXT: v_dual_mov_b32 v45, v61 :: v_dual_mov_b32 v46, v62
+; GFX11-NEXT: v_dual_mov_b32 v47, v63 :: v_dual_mov_b32 v48, v64
+; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB127_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33
+; GFX11-NEXT: v_dual_mov_b32 v64, v16 :: v_dual_mov_b32 v63, v15
+; GFX11-NEXT: v_dual_mov_b32 v62, v14 :: v_dual_mov_b32 v61, v13
+; GFX11-NEXT: v_dual_mov_b32 v60, v12 :: v_dual_mov_b32 v59, v11
+; GFX11-NEXT: v_dual_mov_b32 v58, v10 :: v_dual_mov_b32 v57, v9
+; GFX11-NEXT: v_dual_mov_b32 v56, v8 :: v_dual_mov_b32 v55, v7
+; GFX11-NEXT: v_dual_mov_b32 v54, v6 :: v_dual_mov_b32 v53, v5
+; GFX11-NEXT: v_dual_mov_b32 v52, v4 :: v_dual_mov_b32 v51, v3
+; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31
+; GFX11-NEXT: v_dual_mov_b32 v46, v30 :: v_dual_mov_b32 v45, v29
+; GFX11-NEXT: v_dual_mov_b32 v44, v28 :: v_dual_mov_b32 v43, v27
+; GFX11-NEXT: v_dual_mov_b32 v42, v26 :: v_dual_mov_b32 v41, v25
+; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23
+; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21
+; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19
+; GFX11-NEXT: .LBB127_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48
+; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:32
+; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off
+; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:112
+; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:96
+; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:80
+; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:64
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44
+; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48
+; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52
+; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1969,8 +20425,943 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v32i32_to_v64bf16:
+
define void @v_bitcast_v32i32_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <32 x i32> %value) {
+; GCN-LABEL: v_bitcast_v32i32_to_v64bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v58, 0
+; GCN-NEXT: v_mov_b32_e32 v59, 0
+; GCN-NEXT: v_mov_b32_e32 v56, 0
+; GCN-NEXT: v_mov_b32_e32 v57, 0
+; GCN-NEXT: v_mov_b32_e32 v46, 0
+; GCN-NEXT: v_mov_b32_e32 v47, 0
+; GCN-NEXT: v_mov_b32_e32 v44, 0
+; GCN-NEXT: v_mov_b32_e32 v45, 0
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: v_mov_b32_e32 v43, 0
+; GCN-NEXT: v_mov_b32_e32 v40, 0
+; GCN-NEXT: v_mov_b32_e32 v41, 0
+; GCN-NEXT: v_mov_b32_e32 v54, 0
+; GCN-NEXT: v_mov_b32_e32 v55, 0
+; GCN-NEXT: v_mov_b32_e32 v52, 0
+; GCN-NEXT: v_mov_b32_e32 v53, 0
+; GCN-NEXT: v_mov_b32_e32 v50, 0
+; GCN-NEXT: v_mov_b32_e32 v51, 0
+; GCN-NEXT: v_mov_b32_e32 v48, 0
+; GCN-NEXT: v_mov_b32_e32 v49, 0
+; GCN-NEXT: v_mov_b32_e32 v38, 0
+; GCN-NEXT: v_mov_b32_e32 v39, 0
+; GCN-NEXT: v_mov_b32_e32 v36, 0
+; GCN-NEXT: v_mov_b32_e32 v37, 0
+; GCN-NEXT: v_mov_b32_e32 v34, 0
+; GCN-NEXT: v_mov_b32_e32 v35, 0
+; GCN-NEXT: v_mov_b32_e32 v32, 0
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: v_mov_b32_e32 v31, 0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB128_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: s_waitcnt vmcnt(14)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v63
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v63
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v62
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v62
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v61
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v61
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v60
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v60
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v30
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v30
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v29
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v29
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v28
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v28
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v27
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v27
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v26
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v25
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v25
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v24
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v24
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v23
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v23
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v22
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v22
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v21
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v21
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v20
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v20
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v19
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v19
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v18
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v18
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v17
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v17
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v16
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v15
+; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v14
+; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v13
+; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v12
+; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v11
+; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v10
+; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v9
+; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v8
+; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v7
+; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v6
+; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v5
+; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v4
+; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v3
+; GCN-NEXT: .LBB128_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v59
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v58
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v57
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v56
+; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v47
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v46
+; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v45
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v44
+; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v43
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v42
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v41
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v40
+; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v55
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v54
+; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v53
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v52
+; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v51
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v50
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v49
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v48
+; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v39
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v38
+; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v37
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v36
+; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v35
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v34
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v32
+; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v32i32_to_v64bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: s_mov_b32 s4, 0
+; VI-NEXT: s_mov_b32 s19, s4
+; VI-NEXT: s_mov_b32 s5, s4
+; VI-NEXT: s_mov_b32 s6, s4
+; VI-NEXT: s_mov_b32 s7, s4
+; VI-NEXT: s_mov_b32 s8, s4
+; VI-NEXT: s_mov_b32 s9, s4
+; VI-NEXT: s_mov_b32 s10, s4
+; VI-NEXT: s_mov_b32 s11, s4
+; VI-NEXT: s_mov_b32 s12, s4
+; VI-NEXT: s_mov_b32 s13, s4
+; VI-NEXT: s_mov_b32 s14, s4
+; VI-NEXT: s_mov_b32 s15, s4
+; VI-NEXT: s_mov_b32 s16, s4
+; VI-NEXT: s_mov_b32 s17, s4
+; VI-NEXT: s_mov_b32 s18, s4
+; VI-NEXT: v_mov_b32_e32 v50, s19
+; VI-NEXT: v_mov_b32_e32 v49, s18
+; VI-NEXT: v_mov_b32_e32 v48, s17
+; VI-NEXT: v_mov_b32_e32 v47, s16
+; VI-NEXT: v_mov_b32_e32 v46, s15
+; VI-NEXT: v_mov_b32_e32 v45, s14
+; VI-NEXT: v_mov_b32_e32 v44, s13
+; VI-NEXT: v_mov_b32_e32 v43, s12
+; VI-NEXT: v_mov_b32_e32 v42, s11
+; VI-NEXT: v_mov_b32_e32 v41, s10
+; VI-NEXT: v_mov_b32_e32 v40, s9
+; VI-NEXT: v_mov_b32_e32 v39, s8
+; VI-NEXT: v_mov_b32_e32 v38, s7
+; VI-NEXT: v_mov_b32_e32 v37, s6
+; VI-NEXT: v_mov_b32_e32 v36, s5
+; VI-NEXT: v_mov_b32_e32 v35, s4
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB128_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v50, v18
+; VI-NEXT: v_mov_b32_e32 v49, v17
+; VI-NEXT: v_mov_b32_e32 v48, v16
+; VI-NEXT: v_mov_b32_e32 v47, v15
+; VI-NEXT: v_mov_b32_e32 v46, v14
+; VI-NEXT: v_mov_b32_e32 v45, v13
+; VI-NEXT: v_mov_b32_e32 v44, v12
+; VI-NEXT: v_mov_b32_e32 v43, v11
+; VI-NEXT: v_mov_b32_e32 v42, v10
+; VI-NEXT: v_mov_b32_e32 v41, v9
+; VI-NEXT: v_mov_b32_e32 v40, v8
+; VI-NEXT: v_mov_b32_e32 v39, v7
+; VI-NEXT: v_mov_b32_e32 v38, v6
+; VI-NEXT: v_mov_b32_e32 v37, v5
+; VI-NEXT: v_mov_b32_e32 v36, v4
+; VI-NEXT: v_mov_b32_e32 v35, v3
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: .LBB128_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[47:50]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[35:38]
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; VI-NEXT: s_movk_i32 s4, 0x70
+; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: s_movk_i32 s4, 0x60
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[17:20]
+; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: s_movk_i32 s4, 0x50
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[13:16]
+; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 64, v1
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[9:12]
+; VI-NEXT: flat_store_dwordx4 v[0:1], v[5:8]
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v32i32_to_v64bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_mov_b32 s19, s4
+; GFX9-NEXT: s_mov_b32 s5, s4
+; GFX9-NEXT: s_mov_b32 s6, s4
+; GFX9-NEXT: s_mov_b32 s7, s4
+; GFX9-NEXT: s_mov_b32 s8, s4
+; GFX9-NEXT: s_mov_b32 s9, s4
+; GFX9-NEXT: s_mov_b32 s10, s4
+; GFX9-NEXT: s_mov_b32 s11, s4
+; GFX9-NEXT: s_mov_b32 s12, s4
+; GFX9-NEXT: s_mov_b32 s13, s4
+; GFX9-NEXT: s_mov_b32 s14, s4
+; GFX9-NEXT: s_mov_b32 s15, s4
+; GFX9-NEXT: s_mov_b32 s16, s4
+; GFX9-NEXT: s_mov_b32 s17, s4
+; GFX9-NEXT: s_mov_b32 s18, s4
+; GFX9-NEXT: v_mov_b32_e32 v50, s19
+; GFX9-NEXT: v_mov_b32_e32 v49, s18
+; GFX9-NEXT: v_mov_b32_e32 v48, s17
+; GFX9-NEXT: v_mov_b32_e32 v47, s16
+; GFX9-NEXT: v_mov_b32_e32 v46, s15
+; GFX9-NEXT: v_mov_b32_e32 v45, s14
+; GFX9-NEXT: v_mov_b32_e32 v44, s13
+; GFX9-NEXT: v_mov_b32_e32 v43, s12
+; GFX9-NEXT: v_mov_b32_e32 v42, s11
+; GFX9-NEXT: v_mov_b32_e32 v41, s10
+; GFX9-NEXT: v_mov_b32_e32 v40, s9
+; GFX9-NEXT: v_mov_b32_e32 v39, s8
+; GFX9-NEXT: v_mov_b32_e32 v38, s7
+; GFX9-NEXT: v_mov_b32_e32 v37, s6
+; GFX9-NEXT: v_mov_b32_e32 v36, s5
+; GFX9-NEXT: v_mov_b32_e32 v35, s4
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB128_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v50, v18
+; GFX9-NEXT: v_mov_b32_e32 v49, v17
+; GFX9-NEXT: v_mov_b32_e32 v48, v16
+; GFX9-NEXT: v_mov_b32_e32 v47, v15
+; GFX9-NEXT: v_mov_b32_e32 v46, v14
+; GFX9-NEXT: v_mov_b32_e32 v45, v13
+; GFX9-NEXT: v_mov_b32_e32 v44, v12
+; GFX9-NEXT: v_mov_b32_e32 v43, v11
+; GFX9-NEXT: v_mov_b32_e32 v42, v10
+; GFX9-NEXT: v_mov_b32_e32 v41, v9
+; GFX9-NEXT: v_mov_b32_e32 v40, v8
+; GFX9-NEXT: v_mov_b32_e32 v39, v7
+; GFX9-NEXT: v_mov_b32_e32 v38, v6
+; GFX9-NEXT: v_mov_b32_e32 v37, v5
+; GFX9-NEXT: v_mov_b32_e32 v36, v4
+; GFX9-NEXT: v_mov_b32_e32 v35, v3
+; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(28)
+; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: .LBB128_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:112
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:96
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:80
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off offset:64
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v32i32_to_v64bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72
+; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68
+; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64
+; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60
+; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56
+; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52
+; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48
+; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44
+; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40
+; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36
+; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32
+; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28
+; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24
+; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20
+; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
+; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s15, s0
+; GFX11-NEXT: s_mov_b32 s1, s0
+; GFX11-NEXT: s_mov_b32 s2, s0
+; GFX11-NEXT: s_mov_b32 s3, s0
+; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s5, s0
+; GFX11-NEXT: s_mov_b32 s6, s0
+; GFX11-NEXT: s_mov_b32 s7, s0
+; GFX11-NEXT: s_mov_b32 s8, s0
+; GFX11-NEXT: s_mov_b32 s9, s0
+; GFX11-NEXT: s_mov_b32 s10, s0
+; GFX11-NEXT: s_mov_b32 s11, s0
+; GFX11-NEXT: s_mov_b32 s12, s0
+; GFX11-NEXT: s_mov_b32 s13, s0
+; GFX11-NEXT: s_mov_b32 s14, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v66, s15 :: v_dual_mov_b32 v65, s14
+; GFX11-NEXT: v_dual_mov_b32 v52, s1 :: v_dual_mov_b32 v51, s0
+; GFX11-NEXT: v_dual_mov_b32 v64, s13 :: v_dual_mov_b32 v63, s12
+; GFX11-NEXT: v_dual_mov_b32 v62, s11 :: v_dual_mov_b32 v61, s10
+; GFX11-NEXT: v_dual_mov_b32 v60, s9 :: v_dual_mov_b32 v59, s8
+; GFX11-NEXT: v_dual_mov_b32 v58, s7 :: v_dual_mov_b32 v57, s6
+; GFX11-NEXT: v_dual_mov_b32 v56, s5 :: v_dual_mov_b32 v55, s4
+; GFX11-NEXT: v_dual_mov_b32 v54, s3 :: v_dual_mov_b32 v53, s2
+; GFX11-NEXT: v_dual_mov_b32 v35, v51 :: v_dual_mov_b32 v36, v52
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_mov_b32 v39, v55 :: v_dual_mov_b32 v40, v56
+; GFX11-NEXT: v_dual_mov_b32 v37, v53 :: v_dual_mov_b32 v38, v54
+; GFX11-NEXT: v_dual_mov_b32 v41, v57 :: v_dual_mov_b32 v42, v58
+; GFX11-NEXT: v_dual_mov_b32 v43, v59 :: v_dual_mov_b32 v44, v60
+; GFX11-NEXT: v_dual_mov_b32 v45, v61 :: v_dual_mov_b32 v46, v62
+; GFX11-NEXT: v_dual_mov_b32 v47, v63 :: v_dual_mov_b32 v48, v64
+; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB128_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33
+; GFX11-NEXT: v_dual_mov_b32 v64, v16 :: v_dual_mov_b32 v63, v15
+; GFX11-NEXT: v_dual_mov_b32 v62, v14 :: v_dual_mov_b32 v61, v13
+; GFX11-NEXT: v_dual_mov_b32 v60, v12 :: v_dual_mov_b32 v59, v11
+; GFX11-NEXT: v_dual_mov_b32 v58, v10 :: v_dual_mov_b32 v57, v9
+; GFX11-NEXT: v_dual_mov_b32 v56, v8 :: v_dual_mov_b32 v55, v7
+; GFX11-NEXT: v_dual_mov_b32 v54, v6 :: v_dual_mov_b32 v53, v5
+; GFX11-NEXT: v_dual_mov_b32 v52, v4 :: v_dual_mov_b32 v51, v3
+; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31
+; GFX11-NEXT: v_dual_mov_b32 v46, v30 :: v_dual_mov_b32 v45, v29
+; GFX11-NEXT: v_dual_mov_b32 v44, v28 :: v_dual_mov_b32 v43, v27
+; GFX11-NEXT: v_dual_mov_b32 v42, v26 :: v_dual_mov_b32 v41, v25
+; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23
+; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21
+; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19
+; GFX11-NEXT: .LBB128_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48
+; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:32
+; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off
+; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:112
+; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:96
+; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:80
+; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:64
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44
+; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48
+; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52
+; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -1985,8 +21376,1135 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v64i16_to_v64bf16:
+
define void @v_bitcast_v64i16_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <64 x i16> %value) {
+; GCN-LABEL: v_bitcast_v64i16_to_v64bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:140
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:136
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:132
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:128
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:108
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:104
+; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:100
+; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96
+; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:92
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84
+; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:80
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72
+; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68
+; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:64
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56
+; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:48
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:44
+; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:40
+; GCN-NEXT: s_waitcnt expcnt(6)
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:36
+; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:32
+; GCN-NEXT: s_waitcnt expcnt(5)
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28
+; GCN-NEXT: s_waitcnt expcnt(4)
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:24
+; GCN-NEXT: s_waitcnt expcnt(3)
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16
+; GCN-NEXT: s_waitcnt expcnt(2)
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v57, 0
+; GCN-NEXT: v_mov_b32_e32 v59, 0
+; GCN-NEXT: v_mov_b32_e32 v56, 0
+; GCN-NEXT: v_mov_b32_e32 v58, 0
+; GCN-NEXT: v_mov_b32_e32 v45, 0
+; GCN-NEXT: v_mov_b32_e32 v47, 0
+; GCN-NEXT: v_mov_b32_e32 v44, 0
+; GCN-NEXT: v_mov_b32_e32 v46, 0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v43, 0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB129_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v11
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v12
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v13
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v14
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v15
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v16
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v17
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v18
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v19
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v20
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v41
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v21
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v40
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v22
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v23
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v24
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v55
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v25
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v54
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v26
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v27
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v53
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v28
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v29
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v52
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v30
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v51
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v50
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v49
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v48
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v39
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v38
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v37
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v36
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v35
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v34
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v33
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v32
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v31
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v63
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v62
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v61
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v60
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: .LBB129_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v59
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v57
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v58
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v56
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v47
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v45
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v46
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v44
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v43
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v42
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v64i16_to_v64bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: s_mov_b32 s4, 0
+; VI-NEXT: s_mov_b32 s19, s4
+; VI-NEXT: s_mov_b32 s5, s4
+; VI-NEXT: s_mov_b32 s6, s4
+; VI-NEXT: s_mov_b32 s7, s4
+; VI-NEXT: s_mov_b32 s8, s4
+; VI-NEXT: s_mov_b32 s9, s4
+; VI-NEXT: s_mov_b32 s10, s4
+; VI-NEXT: s_mov_b32 s11, s4
+; VI-NEXT: s_mov_b32 s12, s4
+; VI-NEXT: s_mov_b32 s13, s4
+; VI-NEXT: s_mov_b32 s14, s4
+; VI-NEXT: s_mov_b32 s15, s4
+; VI-NEXT: s_mov_b32 s16, s4
+; VI-NEXT: s_mov_b32 s17, s4
+; VI-NEXT: s_mov_b32 s18, s4
+; VI-NEXT: v_mov_b32_e32 v50, s19
+; VI-NEXT: v_mov_b32_e32 v49, s18
+; VI-NEXT: v_mov_b32_e32 v48, s17
+; VI-NEXT: v_mov_b32_e32 v47, s16
+; VI-NEXT: v_mov_b32_e32 v46, s15
+; VI-NEXT: v_mov_b32_e32 v45, s14
+; VI-NEXT: v_mov_b32_e32 v44, s13
+; VI-NEXT: v_mov_b32_e32 v43, s12
+; VI-NEXT: v_mov_b32_e32 v42, s11
+; VI-NEXT: v_mov_b32_e32 v41, s10
+; VI-NEXT: v_mov_b32_e32 v40, s9
+; VI-NEXT: v_mov_b32_e32 v39, s8
+; VI-NEXT: v_mov_b32_e32 v38, s7
+; VI-NEXT: v_mov_b32_e32 v37, s6
+; VI-NEXT: v_mov_b32_e32 v36, s5
+; VI-NEXT: v_mov_b32_e32 v35, s4
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB129_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v50, v18
+; VI-NEXT: v_mov_b32_e32 v49, v17
+; VI-NEXT: v_mov_b32_e32 v48, v16
+; VI-NEXT: v_mov_b32_e32 v47, v15
+; VI-NEXT: v_mov_b32_e32 v46, v14
+; VI-NEXT: v_mov_b32_e32 v45, v13
+; VI-NEXT: v_mov_b32_e32 v44, v12
+; VI-NEXT: v_mov_b32_e32 v43, v11
+; VI-NEXT: v_mov_b32_e32 v42, v10
+; VI-NEXT: v_mov_b32_e32 v41, v9
+; VI-NEXT: v_mov_b32_e32 v40, v8
+; VI-NEXT: v_mov_b32_e32 v39, v7
+; VI-NEXT: v_mov_b32_e32 v38, v6
+; VI-NEXT: v_mov_b32_e32 v37, v5
+; VI-NEXT: v_mov_b32_e32 v36, v4
+; VI-NEXT: v_mov_b32_e32 v35, v3
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: .LBB129_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[47:50]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[35:38]
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; VI-NEXT: s_movk_i32 s4, 0x70
+; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: s_movk_i32 s4, 0x60
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[17:20]
+; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: s_movk_i32 s4, 0x50
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[13:16]
+; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 64, v1
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[9:12]
+; VI-NEXT: flat_store_dwordx4 v[0:1], v[5:8]
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v64i16_to_v64bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_mov_b32 s19, s4
+; GFX9-NEXT: s_mov_b32 s5, s4
+; GFX9-NEXT: s_mov_b32 s6, s4
+; GFX9-NEXT: s_mov_b32 s7, s4
+; GFX9-NEXT: s_mov_b32 s8, s4
+; GFX9-NEXT: s_mov_b32 s9, s4
+; GFX9-NEXT: s_mov_b32 s10, s4
+; GFX9-NEXT: s_mov_b32 s11, s4
+; GFX9-NEXT: s_mov_b32 s12, s4
+; GFX9-NEXT: s_mov_b32 s13, s4
+; GFX9-NEXT: s_mov_b32 s14, s4
+; GFX9-NEXT: s_mov_b32 s15, s4
+; GFX9-NEXT: s_mov_b32 s16, s4
+; GFX9-NEXT: s_mov_b32 s17, s4
+; GFX9-NEXT: s_mov_b32 s18, s4
+; GFX9-NEXT: v_mov_b32_e32 v50, s19
+; GFX9-NEXT: v_mov_b32_e32 v49, s18
+; GFX9-NEXT: v_mov_b32_e32 v48, s17
+; GFX9-NEXT: v_mov_b32_e32 v47, s16
+; GFX9-NEXT: v_mov_b32_e32 v46, s15
+; GFX9-NEXT: v_mov_b32_e32 v45, s14
+; GFX9-NEXT: v_mov_b32_e32 v44, s13
+; GFX9-NEXT: v_mov_b32_e32 v43, s12
+; GFX9-NEXT: v_mov_b32_e32 v42, s11
+; GFX9-NEXT: v_mov_b32_e32 v41, s10
+; GFX9-NEXT: v_mov_b32_e32 v40, s9
+; GFX9-NEXT: v_mov_b32_e32 v39, s8
+; GFX9-NEXT: v_mov_b32_e32 v38, s7
+; GFX9-NEXT: v_mov_b32_e32 v37, s6
+; GFX9-NEXT: v_mov_b32_e32 v36, s5
+; GFX9-NEXT: v_mov_b32_e32 v35, s4
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB129_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v50, v18
+; GFX9-NEXT: v_mov_b32_e32 v49, v17
+; GFX9-NEXT: v_mov_b32_e32 v48, v16
+; GFX9-NEXT: v_mov_b32_e32 v47, v15
+; GFX9-NEXT: v_mov_b32_e32 v46, v14
+; GFX9-NEXT: v_mov_b32_e32 v45, v13
+; GFX9-NEXT: v_mov_b32_e32 v44, v12
+; GFX9-NEXT: v_mov_b32_e32 v43, v11
+; GFX9-NEXT: v_mov_b32_e32 v42, v10
+; GFX9-NEXT: v_mov_b32_e32 v41, v9
+; GFX9-NEXT: v_mov_b32_e32 v40, v8
+; GFX9-NEXT: v_mov_b32_e32 v39, v7
+; GFX9-NEXT: v_mov_b32_e32 v38, v6
+; GFX9-NEXT: v_mov_b32_e32 v37, v5
+; GFX9-NEXT: v_mov_b32_e32 v36, v4
+; GFX9-NEXT: v_mov_b32_e32 v35, v3
+; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(28)
+; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: .LBB129_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:112
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:96
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:80
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off offset:64
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v64i16_to_v64bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72
+; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68
+; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64
+; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60
+; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56
+; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52
+; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48
+; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44
+; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40
+; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36
+; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32
+; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28
+; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24
+; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20
+; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
+; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s15, s0
+; GFX11-NEXT: s_mov_b32 s1, s0
+; GFX11-NEXT: s_mov_b32 s2, s0
+; GFX11-NEXT: s_mov_b32 s3, s0
+; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s5, s0
+; GFX11-NEXT: s_mov_b32 s6, s0
+; GFX11-NEXT: s_mov_b32 s7, s0
+; GFX11-NEXT: s_mov_b32 s8, s0
+; GFX11-NEXT: s_mov_b32 s9, s0
+; GFX11-NEXT: s_mov_b32 s10, s0
+; GFX11-NEXT: s_mov_b32 s11, s0
+; GFX11-NEXT: s_mov_b32 s12, s0
+; GFX11-NEXT: s_mov_b32 s13, s0
+; GFX11-NEXT: s_mov_b32 s14, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v66, s15 :: v_dual_mov_b32 v65, s14
+; GFX11-NEXT: v_dual_mov_b32 v52, s1 :: v_dual_mov_b32 v51, s0
+; GFX11-NEXT: v_dual_mov_b32 v64, s13 :: v_dual_mov_b32 v63, s12
+; GFX11-NEXT: v_dual_mov_b32 v62, s11 :: v_dual_mov_b32 v61, s10
+; GFX11-NEXT: v_dual_mov_b32 v60, s9 :: v_dual_mov_b32 v59, s8
+; GFX11-NEXT: v_dual_mov_b32 v58, s7 :: v_dual_mov_b32 v57, s6
+; GFX11-NEXT: v_dual_mov_b32 v56, s5 :: v_dual_mov_b32 v55, s4
+; GFX11-NEXT: v_dual_mov_b32 v54, s3 :: v_dual_mov_b32 v53, s2
+; GFX11-NEXT: v_dual_mov_b32 v35, v51 :: v_dual_mov_b32 v36, v52
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_mov_b32 v39, v55 :: v_dual_mov_b32 v40, v56
+; GFX11-NEXT: v_dual_mov_b32 v37, v53 :: v_dual_mov_b32 v38, v54
+; GFX11-NEXT: v_dual_mov_b32 v41, v57 :: v_dual_mov_b32 v42, v58
+; GFX11-NEXT: v_dual_mov_b32 v43, v59 :: v_dual_mov_b32 v44, v60
+; GFX11-NEXT: v_dual_mov_b32 v45, v61 :: v_dual_mov_b32 v46, v62
+; GFX11-NEXT: v_dual_mov_b32 v47, v63 :: v_dual_mov_b32 v48, v64
+; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB129_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33
+; GFX11-NEXT: v_dual_mov_b32 v64, v16 :: v_dual_mov_b32 v63, v15
+; GFX11-NEXT: v_dual_mov_b32 v62, v14 :: v_dual_mov_b32 v61, v13
+; GFX11-NEXT: v_dual_mov_b32 v60, v12 :: v_dual_mov_b32 v59, v11
+; GFX11-NEXT: v_dual_mov_b32 v58, v10 :: v_dual_mov_b32 v57, v9
+; GFX11-NEXT: v_dual_mov_b32 v56, v8 :: v_dual_mov_b32 v55, v7
+; GFX11-NEXT: v_dual_mov_b32 v54, v6 :: v_dual_mov_b32 v53, v5
+; GFX11-NEXT: v_dual_mov_b32 v52, v4 :: v_dual_mov_b32 v51, v3
+; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31
+; GFX11-NEXT: v_dual_mov_b32 v46, v30 :: v_dual_mov_b32 v45, v29
+; GFX11-NEXT: v_dual_mov_b32 v44, v28 :: v_dual_mov_b32 v43, v27
+; GFX11-NEXT: v_dual_mov_b32 v42, v26 :: v_dual_mov_b32 v41, v25
+; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23
+; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21
+; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19
+; GFX11-NEXT: .LBB129_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48
+; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:32
+; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off
+; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:112
+; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:96
+; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:80
+; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:64
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44
+; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48
+; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52
+; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -2001,8 +22519,1199 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v64f16_to_v64bf16:
+
define void @v_bitcast_v64f16_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <64 x half> %value) {
+; GCN-LABEL: v_bitcast_v64f16_to_v64bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v29, v16
+; GCN-NEXT: v_mov_b32_e32 v16, v15
+; GCN-NEXT: v_mov_b32_e32 v15, v14
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68
+; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:64
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:56
+; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:44
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:36
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:32
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:16
+; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32
+; GCN-NEXT: v_mov_b32_e32 v14, 0
+; GCN-NEXT: v_mov_b32_e32 v57, 0
+; GCN-NEXT: v_mov_b32_e32 v59, 0
+; GCN-NEXT: v_mov_b32_e32 v56, 0
+; GCN-NEXT: v_mov_b32_e32 v58, 0
+; GCN-NEXT: v_mov_b32_e32 v45, 0
+; GCN-NEXT: v_mov_b32_e32 v47, 0
+; GCN-NEXT: v_mov_b32_e32 v44, 0
+; GCN-NEXT: v_mov_b32_e32 v46, 0
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v43, 0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v41, 0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v40, 0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v53, 0
+; GCN-NEXT: v_mov_b32_e32 v55, 0
+; GCN-NEXT: v_mov_b32_e32 v51, 0
+; GCN-NEXT: v_mov_b32_e32 v54, 0
+; GCN-NEXT: v_mov_b32_e32 v34, 0
+; GCN-NEXT: v_mov_b32_e32 v52, 0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB130_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_cvt_f16_f32_e32 v57, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v59, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v56, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v58, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v45, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v47, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v44, v9
+; GCN-NEXT: v_cvt_f16_f32_e32 v46, v10
+; GCN-NEXT: v_cvt_f16_f32_e32 v41, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v43, v12
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v13
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v15
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v16
+; GCN-NEXT: v_cvt_f16_f32_e32 v12, v29
+; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17
+; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18
+; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19
+; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20
+; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v51, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v33, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v52, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v34, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v29, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22
+; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30
+; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50
+; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49
+; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23
+; GCN-NEXT: v_cvt_f16_f32_e32 v61, v61
+; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32
+; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48
+; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24
+; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39
+; GCN-NEXT: v_cvt_f16_f32_e32 v60, v60
+; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25
+; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38
+; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37
+; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26
+; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36
+; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v63, v63
+; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28
+; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35
+; GCN-NEXT: v_cvt_f16_f32_e32 v62, v62
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v53, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v54, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v55, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v40, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v42, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v16, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v57
+; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v59
+; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56
+; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58
+; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45
+; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47
+; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44
+; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46
+; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v17
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v18
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v19
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v20
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v21
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v51
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v33
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v52
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v34
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v29
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v22
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v30
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v50
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v49
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v23
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v61
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v32
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v48
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v24
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v39
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v60
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v25
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v38
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v37
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v26
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v31
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v36
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v27
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v63
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v28
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v35
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v62
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v53
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v54
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v40
+; GCN-NEXT: v_mov_b32_e32 v40, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v42
+; GCN-NEXT: v_mov_b32_e32 v42, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v0
+; GCN-NEXT: .LBB130_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_waitcnt vmcnt(14)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v59
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v57
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v58
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v56
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v47
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v45
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v46
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v44
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v43
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v41
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v42
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v40
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v55
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v53
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v54
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v51
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v52
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v34
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v33
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v64f16_to_v64bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: s_mov_b32 s4, 0
+; VI-NEXT: s_mov_b32 s19, s4
+; VI-NEXT: s_mov_b32 s5, s4
+; VI-NEXT: s_mov_b32 s6, s4
+; VI-NEXT: s_mov_b32 s7, s4
+; VI-NEXT: s_mov_b32 s8, s4
+; VI-NEXT: s_mov_b32 s9, s4
+; VI-NEXT: s_mov_b32 s10, s4
+; VI-NEXT: s_mov_b32 s11, s4
+; VI-NEXT: s_mov_b32 s12, s4
+; VI-NEXT: s_mov_b32 s13, s4
+; VI-NEXT: s_mov_b32 s14, s4
+; VI-NEXT: s_mov_b32 s15, s4
+; VI-NEXT: s_mov_b32 s16, s4
+; VI-NEXT: s_mov_b32 s17, s4
+; VI-NEXT: s_mov_b32 s18, s4
+; VI-NEXT: v_mov_b32_e32 v50, s19
+; VI-NEXT: v_mov_b32_e32 v49, s18
+; VI-NEXT: v_mov_b32_e32 v48, s17
+; VI-NEXT: v_mov_b32_e32 v47, s16
+; VI-NEXT: v_mov_b32_e32 v46, s15
+; VI-NEXT: v_mov_b32_e32 v45, s14
+; VI-NEXT: v_mov_b32_e32 v44, s13
+; VI-NEXT: v_mov_b32_e32 v43, s12
+; VI-NEXT: v_mov_b32_e32 v42, s11
+; VI-NEXT: v_mov_b32_e32 v41, s10
+; VI-NEXT: v_mov_b32_e32 v40, s9
+; VI-NEXT: v_mov_b32_e32 v39, s8
+; VI-NEXT: v_mov_b32_e32 v38, s7
+; VI-NEXT: v_mov_b32_e32 v37, s6
+; VI-NEXT: v_mov_b32_e32 v36, s5
+; VI-NEXT: v_mov_b32_e32 v35, s4
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB130_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v50, v18
+; VI-NEXT: v_mov_b32_e32 v49, v17
+; VI-NEXT: v_mov_b32_e32 v48, v16
+; VI-NEXT: v_mov_b32_e32 v47, v15
+; VI-NEXT: v_mov_b32_e32 v46, v14
+; VI-NEXT: v_mov_b32_e32 v45, v13
+; VI-NEXT: v_mov_b32_e32 v44, v12
+; VI-NEXT: v_mov_b32_e32 v43, v11
+; VI-NEXT: v_mov_b32_e32 v42, v10
+; VI-NEXT: v_mov_b32_e32 v41, v9
+; VI-NEXT: v_mov_b32_e32 v40, v8
+; VI-NEXT: v_mov_b32_e32 v39, v7
+; VI-NEXT: v_mov_b32_e32 v38, v6
+; VI-NEXT: v_mov_b32_e32 v37, v5
+; VI-NEXT: v_mov_b32_e32 v36, v4
+; VI-NEXT: v_mov_b32_e32 v35, v3
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: .LBB130_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[47:50]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[35:38]
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; VI-NEXT: s_movk_i32 s4, 0x70
+; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: s_movk_i32 s4, 0x60
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[17:20]
+; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: s_movk_i32 s4, 0x50
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[13:16]
+; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 64, v1
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[9:12]
+; VI-NEXT: flat_store_dwordx4 v[0:1], v[5:8]
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v64f16_to_v64bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_mov_b32 s19, s4
+; GFX9-NEXT: s_mov_b32 s5, s4
+; GFX9-NEXT: s_mov_b32 s6, s4
+; GFX9-NEXT: s_mov_b32 s7, s4
+; GFX9-NEXT: s_mov_b32 s8, s4
+; GFX9-NEXT: s_mov_b32 s9, s4
+; GFX9-NEXT: s_mov_b32 s10, s4
+; GFX9-NEXT: s_mov_b32 s11, s4
+; GFX9-NEXT: s_mov_b32 s12, s4
+; GFX9-NEXT: s_mov_b32 s13, s4
+; GFX9-NEXT: s_mov_b32 s14, s4
+; GFX9-NEXT: s_mov_b32 s15, s4
+; GFX9-NEXT: s_mov_b32 s16, s4
+; GFX9-NEXT: s_mov_b32 s17, s4
+; GFX9-NEXT: s_mov_b32 s18, s4
+; GFX9-NEXT: v_mov_b32_e32 v50, s19
+; GFX9-NEXT: v_mov_b32_e32 v49, s18
+; GFX9-NEXT: v_mov_b32_e32 v48, s17
+; GFX9-NEXT: v_mov_b32_e32 v47, s16
+; GFX9-NEXT: v_mov_b32_e32 v46, s15
+; GFX9-NEXT: v_mov_b32_e32 v45, s14
+; GFX9-NEXT: v_mov_b32_e32 v44, s13
+; GFX9-NEXT: v_mov_b32_e32 v43, s12
+; GFX9-NEXT: v_mov_b32_e32 v42, s11
+; GFX9-NEXT: v_mov_b32_e32 v41, s10
+; GFX9-NEXT: v_mov_b32_e32 v40, s9
+; GFX9-NEXT: v_mov_b32_e32 v39, s8
+; GFX9-NEXT: v_mov_b32_e32 v38, s7
+; GFX9-NEXT: v_mov_b32_e32 v37, s6
+; GFX9-NEXT: v_mov_b32_e32 v36, s5
+; GFX9-NEXT: v_mov_b32_e32 v35, s4
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB130_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v50, v18
+; GFX9-NEXT: v_mov_b32_e32 v49, v17
+; GFX9-NEXT: v_mov_b32_e32 v48, v16
+; GFX9-NEXT: v_mov_b32_e32 v47, v15
+; GFX9-NEXT: v_mov_b32_e32 v46, v14
+; GFX9-NEXT: v_mov_b32_e32 v45, v13
+; GFX9-NEXT: v_mov_b32_e32 v44, v12
+; GFX9-NEXT: v_mov_b32_e32 v43, v11
+; GFX9-NEXT: v_mov_b32_e32 v42, v10
+; GFX9-NEXT: v_mov_b32_e32 v41, v9
+; GFX9-NEXT: v_mov_b32_e32 v40, v8
+; GFX9-NEXT: v_mov_b32_e32 v39, v7
+; GFX9-NEXT: v_mov_b32_e32 v38, v6
+; GFX9-NEXT: v_mov_b32_e32 v37, v5
+; GFX9-NEXT: v_mov_b32_e32 v36, v4
+; GFX9-NEXT: v_mov_b32_e32 v35, v3
+; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(28)
+; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: .LBB130_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:112
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:96
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:80
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off offset:64
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v64f16_to_v64bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72
+; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68
+; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64
+; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60
+; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56
+; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52
+; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48
+; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44
+; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40
+; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36
+; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32
+; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28
+; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24
+; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20
+; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
+; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s15, s0
+; GFX11-NEXT: s_mov_b32 s1, s0
+; GFX11-NEXT: s_mov_b32 s2, s0
+; GFX11-NEXT: s_mov_b32 s3, s0
+; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s5, s0
+; GFX11-NEXT: s_mov_b32 s6, s0
+; GFX11-NEXT: s_mov_b32 s7, s0
+; GFX11-NEXT: s_mov_b32 s8, s0
+; GFX11-NEXT: s_mov_b32 s9, s0
+; GFX11-NEXT: s_mov_b32 s10, s0
+; GFX11-NEXT: s_mov_b32 s11, s0
+; GFX11-NEXT: s_mov_b32 s12, s0
+; GFX11-NEXT: s_mov_b32 s13, s0
+; GFX11-NEXT: s_mov_b32 s14, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v66, s15 :: v_dual_mov_b32 v65, s14
+; GFX11-NEXT: v_dual_mov_b32 v52, s1 :: v_dual_mov_b32 v51, s0
+; GFX11-NEXT: v_dual_mov_b32 v64, s13 :: v_dual_mov_b32 v63, s12
+; GFX11-NEXT: v_dual_mov_b32 v62, s11 :: v_dual_mov_b32 v61, s10
+; GFX11-NEXT: v_dual_mov_b32 v60, s9 :: v_dual_mov_b32 v59, s8
+; GFX11-NEXT: v_dual_mov_b32 v58, s7 :: v_dual_mov_b32 v57, s6
+; GFX11-NEXT: v_dual_mov_b32 v56, s5 :: v_dual_mov_b32 v55, s4
+; GFX11-NEXT: v_dual_mov_b32 v54, s3 :: v_dual_mov_b32 v53, s2
+; GFX11-NEXT: v_dual_mov_b32 v35, v51 :: v_dual_mov_b32 v36, v52
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_mov_b32 v39, v55 :: v_dual_mov_b32 v40, v56
+; GFX11-NEXT: v_dual_mov_b32 v37, v53 :: v_dual_mov_b32 v38, v54
+; GFX11-NEXT: v_dual_mov_b32 v41, v57 :: v_dual_mov_b32 v42, v58
+; GFX11-NEXT: v_dual_mov_b32 v43, v59 :: v_dual_mov_b32 v44, v60
+; GFX11-NEXT: v_dual_mov_b32 v45, v61 :: v_dual_mov_b32 v46, v62
+; GFX11-NEXT: v_dual_mov_b32 v47, v63 :: v_dual_mov_b32 v48, v64
+; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB130_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33
+; GFX11-NEXT: v_dual_mov_b32 v64, v16 :: v_dual_mov_b32 v63, v15
+; GFX11-NEXT: v_dual_mov_b32 v62, v14 :: v_dual_mov_b32 v61, v13
+; GFX11-NEXT: v_dual_mov_b32 v60, v12 :: v_dual_mov_b32 v59, v11
+; GFX11-NEXT: v_dual_mov_b32 v58, v10 :: v_dual_mov_b32 v57, v9
+; GFX11-NEXT: v_dual_mov_b32 v56, v8 :: v_dual_mov_b32 v55, v7
+; GFX11-NEXT: v_dual_mov_b32 v54, v6 :: v_dual_mov_b32 v53, v5
+; GFX11-NEXT: v_dual_mov_b32 v52, v4 :: v_dual_mov_b32 v51, v3
+; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31
+; GFX11-NEXT: v_dual_mov_b32 v46, v30 :: v_dual_mov_b32 v45, v29
+; GFX11-NEXT: v_dual_mov_b32 v44, v28 :: v_dual_mov_b32 v43, v27
+; GFX11-NEXT: v_dual_mov_b32 v42, v26 :: v_dual_mov_b32 v41, v25
+; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23
+; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21
+; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19
+; GFX11-NEXT: .LBB130_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48
+; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:32
+; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off
+; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:112
+; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:96
+; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:80
+; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:64
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44
+; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48
+; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52
+; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -2017,8 +23726,3148 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v128i8_to_v64bf16:
+
define void @v_bitcast_v128i8_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <128 x i8> %value) {
+; GCN-LABEL: v_bitcast_v128i8_to_v64bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:396
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:392
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:380
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:376
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:372
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:228
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:224
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:192
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:132
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:68
+; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:64
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:36
+; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32
+; GCN-NEXT: v_mov_b32_e32 v13, 0
+; GCN-NEXT: v_mov_b32_e32 v14, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v16, 0
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: v_mov_b32_e32 v4, 0
+; GCN-NEXT: v_mov_b32_e32 v56, 0
+; GCN-NEXT: v_mov_b32_e32 v62, 0
+; GCN-NEXT: v_mov_b32_e32 v33, 0
+; GCN-NEXT: v_mov_b32_e32 v10, 0
+; GCN-NEXT: v_mov_b32_e32 v27, 0
+; GCN-NEXT: v_mov_b32_e32 v47, 0
+; GCN-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NEXT: v_mov_b32_e32 v52, 0
+; GCN-NEXT: v_mov_b32_e32 v9, 0
+; GCN-NEXT: v_mov_b32_e32 v43, 0
+; GCN-NEXT: v_mov_b32_e32 v40, 0
+; GCN-NEXT: v_mov_b32_e32 v36, 0
+; GCN-NEXT: v_mov_b32_e32 v49, 0
+; GCN-NEXT: v_mov_b32_e32 v23, 0
+; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: v_mov_b32_e32 v31, 0
+; GCN-NEXT: v_mov_b32_e32 v28, 0
+; GCN-NEXT: v_mov_b32_e32 v24, 0
+; GCN-NEXT: v_mov_b32_e32 v6, 0
+; GCN-NEXT: v_mov_b32_e32 v26, 0
+; GCN-NEXT: v_mov_b32_e32 v61, 0
+; GCN-NEXT: v_mov_b32_e32 v53, 0
+; GCN-NEXT: v_mov_b32_e32 v11, 0
+; GCN-NEXT: v_mov_b32_e32 v41, 0
+; GCN-NEXT: v_mov_b32_e32 v60, 0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v44, 0
+; GCN-NEXT: v_mov_b32_e32 v17, 0
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v19, 0
+; GCN-NEXT: v_mov_b32_e32 v25, 0
+; GCN-NEXT: v_mov_b32_e32 v54, 0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v17, 0
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v20, 0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v17, 0
+; GCN-NEXT: v_mov_b32_e32 v51, 0
+; GCN-NEXT: v_mov_b32_e32 v18, 0
+; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v18, 0
+; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v18, 0
+; GCN-NEXT: v_mov_b32_e32 v21, 0
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v21, 0
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v21, 0
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v30, 0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v21, 0
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v21, 0
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v46, 0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v21, 0
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v21, 0
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v21, 0
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v21, 0
+; GCN-NEXT: v_mov_b32_e32 v50, 0
+; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v50, 0
+; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v50, 0
+; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v50, 0
+; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v50, 0
+; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v50, 0
+; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v50, 0
+; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v50, 0
+; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v50, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB131_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_and_b32_e32 v0, 0xff, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v8
+; GCN-NEXT: v_or_b32_e32 v7, v0, v3
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v8, v0, v3
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v24, v0, v3
+; GCN-NEXT: v_and_b32_e32 v0, 0xff, v48
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v39
+; GCN-NEXT: v_or_b32_e32 v0, v0, v3
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v38
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v29
+; GCN-NEXT: v_or_b32_e32 v23, v3, v4
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v37
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v35
+; GCN-NEXT: v_or_b32_e32 v17, v3, v4
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v15
+; GCN-NEXT: v_or_b32_e32 v18, v3, v4
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v63
+; GCN-NEXT: v_or_b32_e32 v21, v3, v4
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v59
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v58
+; GCN-NEXT: v_or_b32_e32 v25, v3, v4
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v57
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v42
+; GCN-NEXT: v_or_b32_e32 v30, v3, v4
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v45
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v55
+; GCN-NEXT: v_or_b32_e32 v35, v3, v4
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GCN-NEXT: v_or_b32_e32 v55, v3, v4
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GCN-NEXT: v_or_b32_e32 v42, v3, v4
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GCN-NEXT: v_or_b32_e32 v45, v3, v4
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GCN-NEXT: v_or_b32_e32 v32, v3, v4
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GCN-NEXT: v_or_b32_e32 v57, v3, v4
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v14, v4, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v12, v4, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v22, v4, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v4, v4, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v56, v5, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v33, v5, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v10, v5, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v27, v5, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v5, v5, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v52, v6, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v9, v6, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v40, v6, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v36, v6, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v49, v6, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v3, v6, v3
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_or_b32_e32 v31, v11, v6
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_or_b32_e32 v28, v11, v6
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_or_b32_e32 v6, v11, v6
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_or_b32_e32 v26, v13, v11
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_or_b32_e32 v61, v13, v11
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_or_b32_e32 v11, v13, v11
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v41, v15, v13
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v60, v15, v13
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v44, v15, v13
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v13, v15, v13
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v19, v15, v13
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v54, v15, v13
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v13, v15, v13
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v20, v15, v13
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v51, v15, v13
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v13, v15, v13
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v13, v15, v13
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v13, v15, v13
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v13, v15, v13
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v13, v15, v13
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v13, v15, v13
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v50, v15, v13
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v13, v15, v13
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v13, v15, v13
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v13, v15, v13
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v13, v15, v13
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v13, v15, v13
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v13, v15, v13
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v13, v15, v13
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v13, v15, v13
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v13, v15, v13
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v13, v15, v13
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v13, v15, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v30
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v35
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v55
+; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v42
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v45
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v32
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v57
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; GCN-NEXT: .LBB131_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v14
+; GCN-NEXT: v_alignbit_b32 v12, v7, v8, 16
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v16
+; GCN-NEXT: v_mov_b32_e32 v16, v13
+; GCN-NEXT: v_alignbit_b32 v13, v7, v8, 16
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v56
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_alignbit_b32 v14, v7, v4, 16
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v62
+; GCN-NEXT: v_alignbit_b32 v15, v4, v7, 16
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[12:15], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v10
+; GCN-NEXT: v_alignbit_b32 v7, v4, v7, 16
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v47
+; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v52
+; GCN-NEXT: v_alignbit_b32 v9, v4, v5, 16
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v40
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v43
+; GCN-NEXT: v_alignbit_b32 v10, v4, v5, 16
+; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v49
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v36
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v4, v5, 16
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v23
+; GCN-NEXT: v_alignbit_b32 v8, v3, v4, 16
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v31
+; GCN-NEXT: v_alignbit_b32 v9, v3, v4, 16
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v24
+; GCN-NEXT: v_alignbit_b32 v10, v3, v4, 16
+; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v61
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v26
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v53
+; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v60
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v41
+; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v44
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_alignbit_b32 v6, v6, v0, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v54
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v25
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v51
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v17
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v18
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v30
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v50
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v46
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v21
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v128i8_to_v64bf16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:396
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; VI-NEXT: s_mov_b32 s4, 0
+; VI-NEXT: s_mov_b32 s19, s4
+; VI-NEXT: s_mov_b32 s5, s4
+; VI-NEXT: s_mov_b32 s6, s4
+; VI-NEXT: s_mov_b32 s7, s4
+; VI-NEXT: s_mov_b32 s8, s4
+; VI-NEXT: s_mov_b32 s9, s4
+; VI-NEXT: s_mov_b32 s10, s4
+; VI-NEXT: s_mov_b32 s11, s4
+; VI-NEXT: s_mov_b32 s12, s4
+; VI-NEXT: s_mov_b32 s13, s4
+; VI-NEXT: s_mov_b32 s14, s4
+; VI-NEXT: s_mov_b32 s15, s4
+; VI-NEXT: s_mov_b32 s16, s4
+; VI-NEXT: s_mov_b32 s17, s4
+; VI-NEXT: s_mov_b32 s18, s4
+; VI-NEXT: v_mov_b32_e32 v46, s19
+; VI-NEXT: v_mov_b32_e32 v45, s18
+; VI-NEXT: v_mov_b32_e32 v44, s17
+; VI-NEXT: v_mov_b32_e32 v43, s16
+; VI-NEXT: v_mov_b32_e32 v42, s15
+; VI-NEXT: v_mov_b32_e32 v41, s14
+; VI-NEXT: v_mov_b32_e32 v40, s13
+; VI-NEXT: v_mov_b32_e32 v39, s12
+; VI-NEXT: v_mov_b32_e32 v38, s11
+; VI-NEXT: v_mov_b32_e32 v37, s10
+; VI-NEXT: v_mov_b32_e32 v36, s9
+; VI-NEXT: v_mov_b32_e32 v35, s8
+; VI-NEXT: v_mov_b32_e32 v34, s7
+; VI-NEXT: v_mov_b32_e32 v33, s6
+; VI-NEXT: v_mov_b32_e32 v32, s5
+; VI-NEXT: v_mov_b32_e32 v31, s4
+; VI-NEXT: v_mov_b32_e32 v62, v46
+; VI-NEXT: v_mov_b32_e32 v61, v45
+; VI-NEXT: v_mov_b32_e32 v60, v44
+; VI-NEXT: v_mov_b32_e32 v59, v43
+; VI-NEXT: v_mov_b32_e32 v58, v42
+; VI-NEXT: v_mov_b32_e32 v57, v41
+; VI-NEXT: v_mov_b32_e32 v56, v40
+; VI-NEXT: v_mov_b32_e32 v55, v39
+; VI-NEXT: v_mov_b32_e32 v54, v38
+; VI-NEXT: v_mov_b32_e32 v53, v37
+; VI-NEXT: v_mov_b32_e32 v52, v36
+; VI-NEXT: v_mov_b32_e32 v51, v35
+; VI-NEXT: v_mov_b32_e32 v50, v34
+; VI-NEXT: v_mov_b32_e32 v49, v33
+; VI-NEXT: v_mov_b32_e32 v48, v32
+; VI-NEXT: v_mov_b32_e32 v47, v31
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:392
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:388
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:380
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:376
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:372
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:368
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:364
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:360
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:356
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:348
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:344
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:340
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:336
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:332
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:304
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:296
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:292
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:284
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:280
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:272
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:264
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:252
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:244
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:240
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:236
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:232
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:228
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:216
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:212
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:208
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:204
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:200
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:196
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:188
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:176
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:168
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:160
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:152
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:144
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:136
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:128
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:120
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:116
+; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:112
+; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:108
+; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:104
+; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:100
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:96
+; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:92
+; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:88
+; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:84
+; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:80
+; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:76
+; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:72
+; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:68
+; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:64
+; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:56
+; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:52
+; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:48
+; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:40
+; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:32
+; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:28
+; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:24
+; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:16
+; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB131_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v28
+; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v21
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v12, v12, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v11, v11, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(8)
+; VI-NEXT: v_lshlrev_b16_e32 v31, 8, v31
+; VI-NEXT: s_waitcnt vmcnt(7)
+; VI-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v32
+; VI-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v33
+; VI-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v34
+; VI-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v32, v33, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v33
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v34
+; VI-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v33, v33, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v34
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v35
+; VI-NEXT: v_or_b32_sdwa v35, v36, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v34, v34, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v35
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v35, v36, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v36
+; VI-NEXT: v_or_b32_sdwa v36, v37, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v35, v35, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v36
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v36, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v37
+; VI-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v36, v36, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v37
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v38
+; VI-NEXT: v_or_b32_sdwa v38, v39, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v37, v37, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v38, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v30
+; VI-NEXT: v_lshlrev_b16_e32 v12, 8, v24
+; VI-NEXT: v_or_b32_sdwa v11, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v39, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v27
+; VI-NEXT: v_lshlrev_b16_e32 v12, 8, v23
+; VI-NEXT: v_or_b32_sdwa v11, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v12, v15, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v40, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v29
+; VI-NEXT: v_lshlrev_b16_e32 v12, 8, v26
+; VI-NEXT: v_or_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v41, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v17
+; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v25
+; VI-NEXT: v_or_b32_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v42, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v20
+; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v19
+; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v43, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v7
+; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v63
+; VI-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v44, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22
+; VI-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v45, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v46, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v47, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4
+; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; VI-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v48, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v49, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v50, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v51, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v52, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v53, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v54, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v55, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v56, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v57, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v58, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v59, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v60, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v61, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v62, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: .LBB131_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: s_movk_i32 s4, 0x70
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[35:38]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[31:34]
+; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: s_movk_i32 s4, 0x60
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[59:62]
+; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: s_movk_i32 s4, 0x50
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[55:58]
+; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 64, v1
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[51:54]
+; VI-NEXT: flat_store_dwordx4 v[0:1], v[47:50]
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v128i8_to_v64bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:396
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_mov_b32 s19, s4
+; GFX9-NEXT: s_mov_b32 s5, s4
+; GFX9-NEXT: s_mov_b32 s6, s4
+; GFX9-NEXT: s_mov_b32 s7, s4
+; GFX9-NEXT: s_mov_b32 s8, s4
+; GFX9-NEXT: s_mov_b32 s9, s4
+; GFX9-NEXT: s_mov_b32 s10, s4
+; GFX9-NEXT: s_mov_b32 s11, s4
+; GFX9-NEXT: s_mov_b32 s12, s4
+; GFX9-NEXT: s_mov_b32 s13, s4
+; GFX9-NEXT: s_mov_b32 s14, s4
+; GFX9-NEXT: s_mov_b32 s15, s4
+; GFX9-NEXT: s_mov_b32 s16, s4
+; GFX9-NEXT: s_mov_b32 s17, s4
+; GFX9-NEXT: s_mov_b32 s18, s4
+; GFX9-NEXT: v_mov_b32_e32 v46, s19
+; GFX9-NEXT: v_mov_b32_e32 v45, s18
+; GFX9-NEXT: v_mov_b32_e32 v44, s17
+; GFX9-NEXT: v_mov_b32_e32 v43, s16
+; GFX9-NEXT: v_mov_b32_e32 v42, s15
+; GFX9-NEXT: v_mov_b32_e32 v41, s14
+; GFX9-NEXT: v_mov_b32_e32 v40, s13
+; GFX9-NEXT: v_mov_b32_e32 v39, s12
+; GFX9-NEXT: v_mov_b32_e32 v38, s11
+; GFX9-NEXT: v_mov_b32_e32 v37, s10
+; GFX9-NEXT: v_mov_b32_e32 v36, s9
+; GFX9-NEXT: v_mov_b32_e32 v35, s8
+; GFX9-NEXT: v_mov_b32_e32 v34, s7
+; GFX9-NEXT: v_mov_b32_e32 v33, s6
+; GFX9-NEXT: v_mov_b32_e32 v32, s5
+; GFX9-NEXT: v_mov_b32_e32 v31, s4
+; GFX9-NEXT: v_mov_b32_e32 v62, v46
+; GFX9-NEXT: v_mov_b32_e32 v61, v45
+; GFX9-NEXT: v_mov_b32_e32 v60, v44
+; GFX9-NEXT: v_mov_b32_e32 v59, v43
+; GFX9-NEXT: v_mov_b32_e32 v58, v42
+; GFX9-NEXT: v_mov_b32_e32 v57, v41
+; GFX9-NEXT: v_mov_b32_e32 v56, v40
+; GFX9-NEXT: v_mov_b32_e32 v55, v39
+; GFX9-NEXT: v_mov_b32_e32 v54, v38
+; GFX9-NEXT: v_mov_b32_e32 v53, v37
+; GFX9-NEXT: v_mov_b32_e32 v52, v36
+; GFX9-NEXT: v_mov_b32_e32 v51, v35
+; GFX9-NEXT: v_mov_b32_e32 v50, v34
+; GFX9-NEXT: v_mov_b32_e32 v49, v33
+; GFX9-NEXT: v_mov_b32_e32 v48, v32
+; GFX9-NEXT: v_mov_b32_e32 v47, v31
+; GFX9-NEXT: s_waitcnt vmcnt(44)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:392
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:388
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:380
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:376
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:372
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:368
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:364
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:360
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:356
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:348
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:344
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:340
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:336
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:332
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:304
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:296
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:292
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:284
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:280
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:272
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:264
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:252
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:244
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:240
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:236
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:232
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:228
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:216
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:212
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:208
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:204
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:200
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:196
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:188
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:176
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:168
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:160
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:152
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:144
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:136
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:128
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:120
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:116
+; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:112
+; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:108
+; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:104
+; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:100
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:96
+; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:92
+; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:88
+; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:84
+; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:80
+; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:76
+; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:72
+; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:68
+; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:64
+; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:56
+; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:52
+; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:48
+; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:40
+; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:32
+; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:24
+; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:16
+; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB131_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v28
+; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v21
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v12, v12, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v11, v11, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(8)
+; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v31
+; GFX9-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v32
+; GFX9-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; GFX9-NEXT: v_perm_b32 v31, v32, v31, s6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33
+; GFX9-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
+; GFX9-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v32, v34, v33, s6
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
+; GFX9-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v33, v34, v33, s6
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35
+; GFX9-NEXT: v_or_b32_sdwa v35, v36, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v34, v35, v34, s6
+; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v35, v36, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v36
+; GFX9-NEXT: v_or_b32_sdwa v36, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v35, v36, v35, s6
+; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v36
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v36, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v37
+; GFX9-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v36, v37, v36, s6
+; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v37
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v38
+; GFX9-NEXT: v_or_b32_sdwa v38, v39, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v37, v38, v37, s6
+; GFX9-NEXT: v_perm_b32 v38, v11, v12, s6
+; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v30
+; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v24
+; GFX9-NEXT: v_or_b32_sdwa v11, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v39, v12, v11, s6
+; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v27
+; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v23
+; GFX9-NEXT: v_or_b32_sdwa v11, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v12, v15, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v40, v12, v11, s6
+; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v29
+; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v26
+; GFX9-NEXT: v_or_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v41, v4, v11, s6
+; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v17
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v25
+; GFX9-NEXT: v_or_b32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v42, v4, v3, s6
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v20
+; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v19
+; GFX9-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v43, v4, v3, s6
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v7
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v63
+; GFX9-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v44, v3, v0, s6
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v22
+; GFX9-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v45, v3, v0, s6
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v46, v3, v0, s6
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; GFX9-NEXT: v_perm_b32 v47, v3, v0, s6
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4
+; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v48, v5, v4, s6
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v49, v3, v0, s6
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v50, v3, v0, s6
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v51, v3, v0, s6
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v52, v3, v0, s6
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v53, v3, v0, s6
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v54, v3, v0, s6
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v55, v3, v0, s6
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v56, v3, v0, s6
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v57, v3, v0, s6
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v58, v3, v0, s6
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v59, v3, v0, s6
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v60, v3, v0, s6
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v61, v3, v0, s6
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v62, v3, v0, s6
+; GFX9-NEXT: .LBB131_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[59:62], off offset:112
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[55:58], off offset:96
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[51:54], off offset:80
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:64
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v128i8_to_v64bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:600
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:596
+; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:592
+; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:588
+; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:584
+; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:580
+; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:576
+; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:572
+; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:568
+; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:564
+; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:560
+; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:556
+; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:552
+; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:548
+; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:544
+; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:540
+; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:536
+; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:532
+; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:528
+; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:524
+; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:520
+; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:516
+; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:512
+; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:508
+; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:504
+; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:500
+; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:496
+; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:492
+; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:488
+; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:484
+; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:480
+; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:476
+; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:472
+; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:468
+; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:464
+; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:460
+; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:456
+; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:452
+; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:448
+; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:444
+; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:440
+; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:436
+; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:432
+; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:428
+; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:424
+; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:420
+; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:416
+; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:412
+; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:408
+; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:404
+; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:400
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:396
+; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:392
+; GFX11-NEXT: scratch_load_u16 v66, off, s32 offset:388
+; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:384
+; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:380
+; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:376
+; GFX11-NEXT: scratch_load_u16 v70, off, s32 offset:372
+; GFX11-NEXT: scratch_load_u16 v71, off, s32 offset:368
+; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:364
+; GFX11-NEXT: scratch_load_u16 v81, off, s32 offset:360
+; GFX11-NEXT: scratch_load_u16 v82, off, s32 offset:356
+; GFX11-NEXT: scratch_load_u16 v83, off, s32 offset:352
+; GFX11-NEXT: scratch_load_u16 v84, off, s32 offset:348
+; GFX11-NEXT: scratch_load_u16 v85, off, s32 offset:344
+; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:340
+; GFX11-NEXT: scratch_load_u16 v87, off, s32 offset:336
+; GFX11-NEXT: scratch_load_u16 v96, off, s32 offset:332
+; GFX11-NEXT: scratch_load_u16 v97, off, s32 offset:328
+; GFX11-NEXT: scratch_load_u16 v98, off, s32 offset:324
+; GFX11-NEXT: scratch_load_u16 v99, off, s32 offset:320
+; GFX11-NEXT: scratch_load_u16 v100, off, s32 offset:316
+; GFX11-NEXT: scratch_load_u16 v101, off, s32 offset:312
+; GFX11-NEXT: scratch_load_u16 v102, off, s32 offset:308
+; GFX11-NEXT: scratch_load_u16 v103, off, s32 offset:304
+; GFX11-NEXT: scratch_load_u16 v112, off, s32 offset:300
+; GFX11-NEXT: scratch_load_u16 v113, off, s32 offset:296
+; GFX11-NEXT: scratch_load_u16 v114, off, s32 offset:292
+; GFX11-NEXT: scratch_load_u16 v115, off, s32 offset:288
+; GFX11-NEXT: scratch_load_u16 v116, off, s32 offset:284
+; GFX11-NEXT: scratch_load_u16 v117, off, s32 offset:280
+; GFX11-NEXT: scratch_load_u16 v118, off, s32 offset:276
+; GFX11-NEXT: scratch_load_u16 v119, off, s32 offset:272
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_u16 v128, off, s32 offset:268
+; GFX11-NEXT: scratch_load_u16 v129, off, s32 offset:264
+; GFX11-NEXT: scratch_load_u16 v130, off, s32 offset:260
+; GFX11-NEXT: scratch_load_u16 v131, off, s32 offset:256
+; GFX11-NEXT: scratch_load_u16 v132, off, s32 offset:252
+; GFX11-NEXT: scratch_load_u16 v133, off, s32 offset:248
+; GFX11-NEXT: scratch_load_u16 v134, off, s32 offset:244
+; GFX11-NEXT: scratch_load_u16 v135, off, s32 offset:240
+; GFX11-NEXT: scratch_load_u16 v144, off, s32 offset:236
+; GFX11-NEXT: scratch_load_u16 v145, off, s32 offset:232
+; GFX11-NEXT: scratch_load_u16 v146, off, s32 offset:228
+; GFX11-NEXT: scratch_load_u16 v147, off, s32 offset:224
+; GFX11-NEXT: scratch_load_u16 v148, off, s32 offset:220
+; GFX11-NEXT: scratch_load_u16 v149, off, s32 offset:216
+; GFX11-NEXT: scratch_load_u16 v150, off, s32 offset:212
+; GFX11-NEXT: scratch_load_u16 v151, off, s32 offset:208
+; GFX11-NEXT: scratch_load_u16 v160, off, s32 offset:204
+; GFX11-NEXT: scratch_load_u16 v161, off, s32 offset:200
+; GFX11-NEXT: scratch_load_u16 v162, off, s32 offset:196
+; GFX11-NEXT: scratch_load_u16 v163, off, s32 offset:192
+; GFX11-NEXT: scratch_load_u16 v164, off, s32 offset:188
+; GFX11-NEXT: scratch_load_u16 v165, off, s32 offset:184
+; GFX11-NEXT: scratch_load_u16 v166, off, s32 offset:180
+; GFX11-NEXT: scratch_load_u16 v167, off, s32 offset:176
+; GFX11-NEXT: scratch_load_u16 v176, off, s32 offset:172
+; GFX11-NEXT: scratch_load_u16 v177, off, s32 offset:168
+; GFX11-NEXT: scratch_load_u16 v178, off, s32 offset:164
+; GFX11-NEXT: scratch_load_u16 v179, off, s32 offset:160
+; GFX11-NEXT: scratch_load_u16 v180, off, s32 offset:156
+; GFX11-NEXT: scratch_load_u16 v181, off, s32 offset:152
+; GFX11-NEXT: scratch_load_u16 v182, off, s32 offset:148
+; GFX11-NEXT: scratch_load_u16 v183, off, s32 offset:144
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_u16 v63, off, s32 offset:140
+; GFX11-NEXT: scratch_load_u16 v72, off, s32 offset:136
+; GFX11-NEXT: scratch_load_u16 v73, off, s32 offset:132
+; GFX11-NEXT: scratch_load_u16 v74, off, s32 offset:128
+; GFX11-NEXT: scratch_load_u16 v75, off, s32 offset:124
+; GFX11-NEXT: scratch_load_u16 v76, off, s32 offset:120
+; GFX11-NEXT: scratch_load_u16 v77, off, s32 offset:116
+; GFX11-NEXT: scratch_load_u16 v78, off, s32 offset:112
+; GFX11-NEXT: scratch_load_u16 v79, off, s32 offset:108
+; GFX11-NEXT: scratch_load_u16 v88, off, s32 offset:104
+; GFX11-NEXT: scratch_load_u16 v89, off, s32 offset:100
+; GFX11-NEXT: scratch_load_u16 v90, off, s32 offset:96
+; GFX11-NEXT: scratch_load_u16 v91, off, s32 offset:92
+; GFX11-NEXT: scratch_load_u16 v92, off, s32 offset:88
+; GFX11-NEXT: scratch_load_u16 v93, off, s32 offset:84
+; GFX11-NEXT: scratch_load_u16 v94, off, s32 offset:80
+; GFX11-NEXT: scratch_load_u16 v95, off, s32 offset:76
+; GFX11-NEXT: scratch_load_u16 v104, off, s32 offset:72
+; GFX11-NEXT: scratch_load_u16 v105, off, s32 offset:68
+; GFX11-NEXT: scratch_load_u16 v106, off, s32 offset:64
+; GFX11-NEXT: scratch_load_u16 v107, off, s32 offset:60
+; GFX11-NEXT: scratch_load_u16 v108, off, s32 offset:56
+; GFX11-NEXT: scratch_load_u16 v109, off, s32 offset:52
+; GFX11-NEXT: scratch_load_u16 v110, off, s32 offset:48
+; GFX11-NEXT: scratch_load_u16 v111, off, s32 offset:44
+; GFX11-NEXT: scratch_load_u16 v120, off, s32 offset:40
+; GFX11-NEXT: scratch_load_u16 v121, off, s32 offset:36
+; GFX11-NEXT: scratch_load_u16 v122, off, s32 offset:32
+; GFX11-NEXT: scratch_load_u16 v123, off, s32 offset:28
+; GFX11-NEXT: scratch_load_u16 v124, off, s32 offset:24
+; GFX11-NEXT: scratch_load_u16 v125, off, s32 offset:20
+; GFX11-NEXT: scratch_load_u16 v126, off, s32 offset:16
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: scratch_load_u16 v127, off, s32 offset:12
+; GFX11-NEXT: scratch_load_u16 v136, off, s32 offset:8
+; GFX11-NEXT: scratch_load_u16 v137, off, s32 offset:4
+; GFX11-NEXT: scratch_load_u16 v138, off, s32
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s15, s0
+; GFX11-NEXT: s_mov_b32 s1, s0
+; GFX11-NEXT: s_mov_b32 s2, s0
+; GFX11-NEXT: s_mov_b32 s3, s0
+; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s5, s0
+; GFX11-NEXT: s_mov_b32 s6, s0
+; GFX11-NEXT: s_mov_b32 s7, s0
+; GFX11-NEXT: s_mov_b32 s8, s0
+; GFX11-NEXT: s_mov_b32 s9, s0
+; GFX11-NEXT: s_mov_b32 s10, s0
+; GFX11-NEXT: s_mov_b32 s11, s0
+; GFX11-NEXT: s_mov_b32 s12, s0
+; GFX11-NEXT: s_mov_b32 s13, s0
+; GFX11-NEXT: s_mov_b32 s14, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v46, s15 :: v_dual_mov_b32 v45, s14
+; GFX11-NEXT: v_dual_mov_b32 v44, s13 :: v_dual_mov_b32 v43, s12
+; GFX11-NEXT: v_dual_mov_b32 v42, s11 :: v_dual_mov_b32 v41, s10
+; GFX11-NEXT: v_dual_mov_b32 v40, s9 :: v_dual_mov_b32 v39, s8
+; GFX11-NEXT: v_dual_mov_b32 v38, s7 :: v_dual_mov_b32 v37, s6
+; GFX11-NEXT: v_dual_mov_b32 v36, s5 :: v_dual_mov_b32 v35, s4
+; GFX11-NEXT: v_dual_mov_b32 v34, s3 :: v_dual_mov_b32 v33, s2
+; GFX11-NEXT: v_dual_mov_b32 v32, s1 :: v_dual_mov_b32 v31, s0
+; GFX11-NEXT: v_dual_mov_b32 v62, v46 :: v_dual_mov_b32 v61, v45
+; GFX11-NEXT: v_dual_mov_b32 v60, v44 :: v_dual_mov_b32 v59, v43
+; GFX11-NEXT: v_dual_mov_b32 v58, v42 :: v_dual_mov_b32 v57, v41
+; GFX11-NEXT: v_dual_mov_b32 v56, v40 :: v_dual_mov_b32 v55, v39
+; GFX11-NEXT: v_dual_mov_b32 v54, v38 :: v_dual_mov_b32 v53, v37
+; GFX11-NEXT: v_dual_mov_b32 v52, v36 :: v_dual_mov_b32 v51, v35
+; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33
+; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB131_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v3
+; GFX11-NEXT: v_lshlrev_b16 v3, 8, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-NEXT: v_lshlrev_b16 v5, 8, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT: v_lshlrev_b16 v7, 8, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v9
+; GFX11-NEXT: v_lshlrev_b16 v9, 8, v10
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
+; GFX11-NEXT: v_or_b32_e32 v4, v6, v7
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v15
+; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
+; GFX11-NEXT: v_lshlrev_b16 v7, 8, v16
+; GFX11-NEXT: v_perm_b32 v31, v3, v0, 0x5040100
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v11
+; GFX11-NEXT: v_lshlrev_b16 v3, 8, v12
+; GFX11-NEXT: v_perm_b32 v32, v5, v4, 0x5040100
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v13
+; GFX11-NEXT: v_lshlrev_b16 v5, 8, v14
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v17
+; GFX11-NEXT: v_lshlrev_b16 v9, 8, v18
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v19
+; GFX11-NEXT: v_lshlrev_b16 v11, 8, v20
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
+; GFX11-NEXT: v_or_b32_e32 v4, v6, v7
+; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
+; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v21
+; GFX11-NEXT: v_lshlrev_b16 v8, 8, v22
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v23
+; GFX11-NEXT: v_lshlrev_b16 v10, 8, v24
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v25
+; GFX11-NEXT: v_lshlrev_b16 v12, 8, v26
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v27
+; GFX11-NEXT: v_lshlrev_b16 v14, 8, v28
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v29
+; GFX11-NEXT: v_lshlrev_b16 v16, 8, v30
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
+; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
+; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
+; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
+; GFX11-NEXT: v_perm_b32 v33, v3, v0, 0x5040100
+; GFX11-NEXT: v_perm_b32 v34, v5, v4, 0x5040100
+; GFX11-NEXT: v_perm_b32 v35, v7, v6, 0x5040100
+; GFX11-NEXT: v_perm_b32 v36, v9, v8, 0x5040100
+; GFX11-NEXT: v_perm_b32 v37, v11, v10, 0x5040100
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v138
+; GFX11-NEXT: v_lshlrev_b16 v3, 8, v137
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v136
+; GFX11-NEXT: v_lshlrev_b16 v5, 8, v127
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v126
+; GFX11-NEXT: v_lshlrev_b16 v7, 8, v125
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v124
+; GFX11-NEXT: v_lshlrev_b16 v9, 8, v123
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v122
+; GFX11-NEXT: v_lshlrev_b16 v11, 8, v121
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
+; GFX11-NEXT: v_or_b32_e32 v4, v6, v7
+; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
+; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v120
+; GFX11-NEXT: v_lshlrev_b16 v8, 8, v111
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v110
+; GFX11-NEXT: v_lshlrev_b16 v10, 8, v109
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v108
+; GFX11-NEXT: v_lshlrev_b16 v12, 8, v107
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v106
+; GFX11-NEXT: v_lshlrev_b16 v14, 8, v105
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v104
+; GFX11-NEXT: v_lshlrev_b16 v16, 8, v95
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
+; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
+; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
+; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
+; GFX11-NEXT: v_perm_b32 v38, v3, v0, 0x5040100
+; GFX11-NEXT: v_perm_b32 v39, v5, v4, 0x5040100
+; GFX11-NEXT: v_perm_b32 v40, v7, v6, 0x5040100
+; GFX11-NEXT: v_perm_b32 v41, v9, v8, 0x5040100
+; GFX11-NEXT: v_perm_b32 v42, v11, v10, 0x5040100
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v94
+; GFX11-NEXT: v_lshlrev_b16 v3, 8, v93
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v92
+; GFX11-NEXT: v_lshlrev_b16 v5, 8, v91
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v90
+; GFX11-NEXT: v_lshlrev_b16 v7, 8, v89
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v88
+; GFX11-NEXT: v_lshlrev_b16 v9, 8, v79
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v78
+; GFX11-NEXT: v_lshlrev_b16 v11, 8, v77
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
+; GFX11-NEXT: v_or_b32_e32 v4, v6, v7
+; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
+; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v76
+; GFX11-NEXT: v_lshlrev_b16 v8, 8, v75
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v74
+; GFX11-NEXT: v_lshlrev_b16 v10, 8, v73
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v72
+; GFX11-NEXT: v_lshlrev_b16 v12, 8, v63
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v183
+; GFX11-NEXT: v_lshlrev_b16 v14, 8, v182
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v181
+; GFX11-NEXT: v_lshlrev_b16 v16, 8, v180
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
+; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
+; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
+; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
+; GFX11-NEXT: v_perm_b32 v43, v3, v0, 0x5040100
+; GFX11-NEXT: v_perm_b32 v44, v5, v4, 0x5040100
+; GFX11-NEXT: v_perm_b32 v45, v7, v6, 0x5040100
+; GFX11-NEXT: v_perm_b32 v46, v9, v8, 0x5040100
+; GFX11-NEXT: v_perm_b32 v47, v11, v10, 0x5040100
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v179
+; GFX11-NEXT: v_lshlrev_b16 v3, 8, v178
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v177
+; GFX11-NEXT: v_lshlrev_b16 v5, 8, v176
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v167
+; GFX11-NEXT: v_lshlrev_b16 v7, 8, v166
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v165
+; GFX11-NEXT: v_lshlrev_b16 v9, 8, v164
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v163
+; GFX11-NEXT: v_lshlrev_b16 v11, 8, v162
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
+; GFX11-NEXT: v_or_b32_e32 v4, v6, v7
+; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
+; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v161
+; GFX11-NEXT: v_lshlrev_b16 v8, 8, v160
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v151
+; GFX11-NEXT: v_lshlrev_b16 v10, 8, v150
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v149
+; GFX11-NEXT: v_lshlrev_b16 v12, 8, v148
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v147
+; GFX11-NEXT: v_lshlrev_b16 v14, 8, v146
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v145
+; GFX11-NEXT: v_lshlrev_b16 v16, 8, v144
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
+; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
+; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
+; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
+; GFX11-NEXT: v_perm_b32 v48, v3, v0, 0x5040100
+; GFX11-NEXT: v_perm_b32 v49, v5, v4, 0x5040100
+; GFX11-NEXT: v_perm_b32 v50, v7, v6, 0x5040100
+; GFX11-NEXT: v_perm_b32 v51, v9, v8, 0x5040100
+; GFX11-NEXT: v_perm_b32 v52, v11, v10, 0x5040100
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v135
+; GFX11-NEXT: v_lshlrev_b16 v3, 8, v134
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v133
+; GFX11-NEXT: v_lshlrev_b16 v5, 8, v132
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v131
+; GFX11-NEXT: v_lshlrev_b16 v7, 8, v130
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v129
+; GFX11-NEXT: v_lshlrev_b16 v9, 8, v128
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v119
+; GFX11-NEXT: v_lshlrev_b16 v11, 8, v118
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
+; GFX11-NEXT: v_or_b32_e32 v4, v6, v7
+; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
+; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v117
+; GFX11-NEXT: v_lshlrev_b16 v8, 8, v116
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v115
+; GFX11-NEXT: v_lshlrev_b16 v10, 8, v114
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v113
+; GFX11-NEXT: v_lshlrev_b16 v12, 8, v112
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v103
+; GFX11-NEXT: v_lshlrev_b16 v14, 8, v102
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v101
+; GFX11-NEXT: v_lshlrev_b16 v16, 8, v100
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
+; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
+; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
+; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
+; GFX11-NEXT: v_perm_b32 v53, v3, v0, 0x5040100
+; GFX11-NEXT: v_perm_b32 v54, v5, v4, 0x5040100
+; GFX11-NEXT: v_perm_b32 v55, v7, v6, 0x5040100
+; GFX11-NEXT: v_perm_b32 v56, v9, v8, 0x5040100
+; GFX11-NEXT: v_perm_b32 v57, v11, v10, 0x5040100
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v99
+; GFX11-NEXT: v_lshlrev_b16 v3, 8, v98
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v97
+; GFX11-NEXT: v_lshlrev_b16 v5, 8, v96
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v87
+; GFX11-NEXT: v_lshlrev_b16 v7, 8, v86
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v85
+; GFX11-NEXT: v_lshlrev_b16 v9, 8, v84
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v83
+; GFX11-NEXT: v_lshlrev_b16 v11, 8, v82
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
+; GFX11-NEXT: v_or_b32_e32 v4, v6, v7
+; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
+; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v81
+; GFX11-NEXT: v_lshlrev_b16 v8, 8, v80
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v71
+; GFX11-NEXT: v_lshlrev_b16 v10, 8, v70
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v69
+; GFX11-NEXT: v_lshlrev_b16 v12, 8, v68
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v67
+; GFX11-NEXT: v_lshlrev_b16 v14, 8, v66
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v65
+; GFX11-NEXT: v_lshlrev_b16 v16, 8, v64
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
+; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
+; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
+; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
+; GFX11-NEXT: v_perm_b32 v58, v3, v0, 0x5040100
+; GFX11-NEXT: v_perm_b32 v59, v5, v4, 0x5040100
+; GFX11-NEXT: v_perm_b32 v60, v7, v6, 0x5040100
+; GFX11-NEXT: v_perm_b32 v61, v9, v8, 0x5040100
+; GFX11-NEXT: v_perm_b32 v62, v11, v10, 0x5040100
+; GFX11-NEXT: .LBB131_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:48
+; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:32
+; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off
+; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:112
+; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:96
+; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off offset:80
+; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:64
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:400
+; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:404
+; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:408
+; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:412
+; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:416
+; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:420
+; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:424
+; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:428
+; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:432
+; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:436
+; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:440
+; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:444
+; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:448
+; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:452
+; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:456
+; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:460
+; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:464
+; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:468
+; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:472
+; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:476
+; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:480
+; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:484
+; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:488
+; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:492
+; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:496
+; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:500
+; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:504
+; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:508
+; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:512
+; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:516
+; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:520
+; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:524
+; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:528
+; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:532
+; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:536
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:540
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:544
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:548
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:552
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:556
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:560
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:564
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:568
+; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:572
+; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:576
+; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:580
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:584
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:588
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:592
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:596
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:600
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -2033,8 +26882,814 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v64bf16_to_v64i16:
+
define void @v_bitcast_v64bf16_to_v64i16(i32 %cond, ptr addrspace(1) %out, <64 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v64bf16_to_v64i16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:140
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:56
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:28
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:24
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v31, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v32, v31
+; GCN-NEXT: v_mov_b32_e32 v33, v31
+; GCN-NEXT: v_mov_b32_e32 v34, v31
+; GCN-NEXT: v_mov_b32_e32 v35, v31
+; GCN-NEXT: v_mov_b32_e32 v36, v31
+; GCN-NEXT: v_mov_b32_e32 v37, v31
+; GCN-NEXT: v_mov_b32_e32 v38, v31
+; GCN-NEXT: v_mov_b32_e32 v48, v31
+; GCN-NEXT: v_mov_b32_e32 v49, v31
+; GCN-NEXT: v_mov_b32_e32 v50, v31
+; GCN-NEXT: v_mov_b32_e32 v51, v31
+; GCN-NEXT: v_mov_b32_e32 v52, v31
+; GCN-NEXT: v_mov_b32_e32 v53, v31
+; GCN-NEXT: v_mov_b32_e32 v54, v31
+; GCN-NEXT: v_mov_b32_e32 v55, v31
+; GCN-NEXT: v_mov_b32_e32 v39, v31
+; GCN-NEXT: v_mov_b32_e32 v40, v31
+; GCN-NEXT: v_mov_b32_e32 v41, v31
+; GCN-NEXT: v_mov_b32_e32 v42, v31
+; GCN-NEXT: v_mov_b32_e32 v43, v31
+; GCN-NEXT: v_mov_b32_e32 v44, v31
+; GCN-NEXT: v_mov_b32_e32 v45, v31
+; GCN-NEXT: v_mov_b32_e32 v46, v31
+; GCN-NEXT: v_mov_b32_e32 v56, v31
+; GCN-NEXT: v_mov_b32_e32 v57, v31
+; GCN-NEXT: v_mov_b32_e32 v58, v31
+; GCN-NEXT: v_mov_b32_e32 v59, v31
+; GCN-NEXT: v_mov_b32_e32 v60, v31
+; GCN-NEXT: v_mov_b32_e32 v61, v31
+; GCN-NEXT: v_mov_b32_e32 v62, v31
+; GCN-NEXT: v_mov_b32_e32 v63, v31
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB132_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v33, v0, v3, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v34, v0, v3, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_alignbit_b32 v35, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v36, v4, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v47
+; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v30
+; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v29
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v11
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v12
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v13
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v14
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v15
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v16
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v17
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v18
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v19
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v48
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v50
+; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v51
+; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v52
+; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v60
+; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v45
+; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40
+; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v53
+; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54
+; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v55
+; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v41
+; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42
+; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v43
+; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v44
+; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v46
+; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v56
+; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v57
+; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v58
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_alignbit_b32 v37, v22, v37, 16
+; GCN-NEXT: v_alignbit_b32 v38, v29, v38, 16
+; GCN-NEXT: v_alignbit_b32 v48, v30, v39, 16
+; GCN-NEXT: v_alignbit_b32 v49, v50, v49, 16
+; GCN-NEXT: v_alignbit_b32 v50, v51, v59, 16
+; GCN-NEXT: v_alignbit_b32 v51, v52, v62, 16
+; GCN-NEXT: v_alignbit_b32 v52, v40, v63, 16
+; GCN-NEXT: v_alignbit_b32 v53, v53, v61, 16
+; GCN-NEXT: v_alignbit_b32 v54, v54, v0, 16
+; GCN-NEXT: v_alignbit_b32 v55, v55, v3, 16
+; GCN-NEXT: v_alignbit_b32 v39, v41, v4, 16
+; GCN-NEXT: v_alignbit_b32 v40, v42, v5, 16
+; GCN-NEXT: v_alignbit_b32 v41, v43, v6, 16
+; GCN-NEXT: v_alignbit_b32 v42, v44, v7, 16
+; GCN-NEXT: v_alignbit_b32 v43, v45, v8, 16
+; GCN-NEXT: v_alignbit_b32 v44, v46, v9, 16
+; GCN-NEXT: v_alignbit_b32 v45, v47, v10, 16
+; GCN-NEXT: v_alignbit_b32 v46, v56, v11, 16
+; GCN-NEXT: v_alignbit_b32 v56, v27, v12, 16
+; GCN-NEXT: v_alignbit_b32 v57, v26, v13, 16
+; GCN-NEXT: v_alignbit_b32 v58, v25, v14, 16
+; GCN-NEXT: v_alignbit_b32 v59, v28, v15, 16
+; GCN-NEXT: v_alignbit_b32 v60, v23, v16, 16
+; GCN-NEXT: v_alignbit_b32 v61, v24, v17, 16
+; GCN-NEXT: v_alignbit_b32 v62, v20, v18, 16
+; GCN-NEXT: v_alignbit_b32 v63, v21, v19, 16
+; GCN-NEXT: .LBB132_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[60:63], v[1:2], s[4:7], 0 addr64 offset:112
+; GCN-NEXT: buffer_store_dwordx4 v[56:59], v[1:2], s[4:7], 0 addr64 offset:96
+; GCN-NEXT: buffer_store_dwordx4 v[43:46], v[1:2], s[4:7], 0 addr64 offset:80
+; GCN-NEXT: buffer_store_dwordx4 v[39:42], v[1:2], s[4:7], 0 addr64 offset:64
+; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt expcnt(6)
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(14)
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt expcnt(5)
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt expcnt(4)
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v64bf16_to_v64i16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: s_mov_b32 s4, 0
+; VI-NEXT: s_mov_b32 s19, s4
+; VI-NEXT: s_mov_b32 s5, s4
+; VI-NEXT: s_mov_b32 s6, s4
+; VI-NEXT: s_mov_b32 s7, s4
+; VI-NEXT: s_mov_b32 s8, s4
+; VI-NEXT: s_mov_b32 s9, s4
+; VI-NEXT: s_mov_b32 s10, s4
+; VI-NEXT: s_mov_b32 s11, s4
+; VI-NEXT: s_mov_b32 s12, s4
+; VI-NEXT: s_mov_b32 s13, s4
+; VI-NEXT: s_mov_b32 s14, s4
+; VI-NEXT: s_mov_b32 s15, s4
+; VI-NEXT: s_mov_b32 s16, s4
+; VI-NEXT: s_mov_b32 s17, s4
+; VI-NEXT: s_mov_b32 s18, s4
+; VI-NEXT: v_mov_b32_e32 v50, s19
+; VI-NEXT: v_mov_b32_e32 v49, s18
+; VI-NEXT: v_mov_b32_e32 v48, s17
+; VI-NEXT: v_mov_b32_e32 v47, s16
+; VI-NEXT: v_mov_b32_e32 v46, s15
+; VI-NEXT: v_mov_b32_e32 v45, s14
+; VI-NEXT: v_mov_b32_e32 v44, s13
+; VI-NEXT: v_mov_b32_e32 v43, s12
+; VI-NEXT: v_mov_b32_e32 v42, s11
+; VI-NEXT: v_mov_b32_e32 v41, s10
+; VI-NEXT: v_mov_b32_e32 v40, s9
+; VI-NEXT: v_mov_b32_e32 v39, s8
+; VI-NEXT: v_mov_b32_e32 v38, s7
+; VI-NEXT: v_mov_b32_e32 v37, s6
+; VI-NEXT: v_mov_b32_e32 v36, s5
+; VI-NEXT: v_mov_b32_e32 v35, s4
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB132_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v50, v18
+; VI-NEXT: v_mov_b32_e32 v49, v17
+; VI-NEXT: v_mov_b32_e32 v48, v16
+; VI-NEXT: v_mov_b32_e32 v47, v15
+; VI-NEXT: v_mov_b32_e32 v46, v14
+; VI-NEXT: v_mov_b32_e32 v45, v13
+; VI-NEXT: v_mov_b32_e32 v44, v12
+; VI-NEXT: v_mov_b32_e32 v43, v11
+; VI-NEXT: v_mov_b32_e32 v42, v10
+; VI-NEXT: v_mov_b32_e32 v41, v9
+; VI-NEXT: v_mov_b32_e32 v40, v8
+; VI-NEXT: v_mov_b32_e32 v39, v7
+; VI-NEXT: v_mov_b32_e32 v38, v6
+; VI-NEXT: v_mov_b32_e32 v37, v5
+; VI-NEXT: v_mov_b32_e32 v36, v4
+; VI-NEXT: v_mov_b32_e32 v35, v3
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: .LBB132_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[47:50]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[35:38]
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; VI-NEXT: s_movk_i32 s4, 0x70
+; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: s_movk_i32 s4, 0x60
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[17:20]
+; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: s_movk_i32 s4, 0x50
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[13:16]
+; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 64, v1
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[9:12]
+; VI-NEXT: flat_store_dwordx4 v[0:1], v[5:8]
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v64bf16_to_v64i16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_mov_b32 s19, s4
+; GFX9-NEXT: s_mov_b32 s5, s4
+; GFX9-NEXT: s_mov_b32 s6, s4
+; GFX9-NEXT: s_mov_b32 s7, s4
+; GFX9-NEXT: s_mov_b32 s8, s4
+; GFX9-NEXT: s_mov_b32 s9, s4
+; GFX9-NEXT: s_mov_b32 s10, s4
+; GFX9-NEXT: s_mov_b32 s11, s4
+; GFX9-NEXT: s_mov_b32 s12, s4
+; GFX9-NEXT: s_mov_b32 s13, s4
+; GFX9-NEXT: s_mov_b32 s14, s4
+; GFX9-NEXT: s_mov_b32 s15, s4
+; GFX9-NEXT: s_mov_b32 s16, s4
+; GFX9-NEXT: s_mov_b32 s17, s4
+; GFX9-NEXT: s_mov_b32 s18, s4
+; GFX9-NEXT: v_mov_b32_e32 v50, s19
+; GFX9-NEXT: v_mov_b32_e32 v49, s18
+; GFX9-NEXT: v_mov_b32_e32 v48, s17
+; GFX9-NEXT: v_mov_b32_e32 v47, s16
+; GFX9-NEXT: v_mov_b32_e32 v46, s15
+; GFX9-NEXT: v_mov_b32_e32 v45, s14
+; GFX9-NEXT: v_mov_b32_e32 v44, s13
+; GFX9-NEXT: v_mov_b32_e32 v43, s12
+; GFX9-NEXT: v_mov_b32_e32 v42, s11
+; GFX9-NEXT: v_mov_b32_e32 v41, s10
+; GFX9-NEXT: v_mov_b32_e32 v40, s9
+; GFX9-NEXT: v_mov_b32_e32 v39, s8
+; GFX9-NEXT: v_mov_b32_e32 v38, s7
+; GFX9-NEXT: v_mov_b32_e32 v37, s6
+; GFX9-NEXT: v_mov_b32_e32 v36, s5
+; GFX9-NEXT: v_mov_b32_e32 v35, s4
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB132_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v50, v18
+; GFX9-NEXT: v_mov_b32_e32 v49, v17
+; GFX9-NEXT: v_mov_b32_e32 v48, v16
+; GFX9-NEXT: v_mov_b32_e32 v47, v15
+; GFX9-NEXT: v_mov_b32_e32 v46, v14
+; GFX9-NEXT: v_mov_b32_e32 v45, v13
+; GFX9-NEXT: v_mov_b32_e32 v44, v12
+; GFX9-NEXT: v_mov_b32_e32 v43, v11
+; GFX9-NEXT: v_mov_b32_e32 v42, v10
+; GFX9-NEXT: v_mov_b32_e32 v41, v9
+; GFX9-NEXT: v_mov_b32_e32 v40, v8
+; GFX9-NEXT: v_mov_b32_e32 v39, v7
+; GFX9-NEXT: v_mov_b32_e32 v38, v6
+; GFX9-NEXT: v_mov_b32_e32 v37, v5
+; GFX9-NEXT: v_mov_b32_e32 v36, v4
+; GFX9-NEXT: v_mov_b32_e32 v35, v3
+; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(28)
+; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: .LBB132_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:112
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:96
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:80
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off offset:64
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v64bf16_to_v64i16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72
+; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68
+; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64
+; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60
+; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56
+; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52
+; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48
+; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44
+; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40
+; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36
+; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32
+; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28
+; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24
+; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20
+; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
+; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s15, s0
+; GFX11-NEXT: s_mov_b32 s1, s0
+; GFX11-NEXT: s_mov_b32 s2, s0
+; GFX11-NEXT: s_mov_b32 s3, s0
+; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s5, s0
+; GFX11-NEXT: s_mov_b32 s6, s0
+; GFX11-NEXT: s_mov_b32 s7, s0
+; GFX11-NEXT: s_mov_b32 s8, s0
+; GFX11-NEXT: s_mov_b32 s9, s0
+; GFX11-NEXT: s_mov_b32 s10, s0
+; GFX11-NEXT: s_mov_b32 s11, s0
+; GFX11-NEXT: s_mov_b32 s12, s0
+; GFX11-NEXT: s_mov_b32 s13, s0
+; GFX11-NEXT: s_mov_b32 s14, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v66, s15 :: v_dual_mov_b32 v65, s14
+; GFX11-NEXT: v_dual_mov_b32 v52, s1 :: v_dual_mov_b32 v51, s0
+; GFX11-NEXT: v_dual_mov_b32 v64, s13 :: v_dual_mov_b32 v63, s12
+; GFX11-NEXT: v_dual_mov_b32 v62, s11 :: v_dual_mov_b32 v61, s10
+; GFX11-NEXT: v_dual_mov_b32 v60, s9 :: v_dual_mov_b32 v59, s8
+; GFX11-NEXT: v_dual_mov_b32 v58, s7 :: v_dual_mov_b32 v57, s6
+; GFX11-NEXT: v_dual_mov_b32 v56, s5 :: v_dual_mov_b32 v55, s4
+; GFX11-NEXT: v_dual_mov_b32 v54, s3 :: v_dual_mov_b32 v53, s2
+; GFX11-NEXT: v_dual_mov_b32 v35, v51 :: v_dual_mov_b32 v36, v52
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_mov_b32 v39, v55 :: v_dual_mov_b32 v40, v56
+; GFX11-NEXT: v_dual_mov_b32 v37, v53 :: v_dual_mov_b32 v38, v54
+; GFX11-NEXT: v_dual_mov_b32 v41, v57 :: v_dual_mov_b32 v42, v58
+; GFX11-NEXT: v_dual_mov_b32 v43, v59 :: v_dual_mov_b32 v44, v60
+; GFX11-NEXT: v_dual_mov_b32 v45, v61 :: v_dual_mov_b32 v46, v62
+; GFX11-NEXT: v_dual_mov_b32 v47, v63 :: v_dual_mov_b32 v48, v64
+; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB132_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33
+; GFX11-NEXT: v_dual_mov_b32 v64, v16 :: v_dual_mov_b32 v63, v15
+; GFX11-NEXT: v_dual_mov_b32 v62, v14 :: v_dual_mov_b32 v61, v13
+; GFX11-NEXT: v_dual_mov_b32 v60, v12 :: v_dual_mov_b32 v59, v11
+; GFX11-NEXT: v_dual_mov_b32 v58, v10 :: v_dual_mov_b32 v57, v9
+; GFX11-NEXT: v_dual_mov_b32 v56, v8 :: v_dual_mov_b32 v55, v7
+; GFX11-NEXT: v_dual_mov_b32 v54, v6 :: v_dual_mov_b32 v53, v5
+; GFX11-NEXT: v_dual_mov_b32 v52, v4 :: v_dual_mov_b32 v51, v3
+; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31
+; GFX11-NEXT: v_dual_mov_b32 v46, v30 :: v_dual_mov_b32 v45, v29
+; GFX11-NEXT: v_dual_mov_b32 v44, v28 :: v_dual_mov_b32 v43, v27
+; GFX11-NEXT: v_dual_mov_b32 v42, v26 :: v_dual_mov_b32 v41, v25
+; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23
+; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21
+; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19
+; GFX11-NEXT: .LBB132_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48
+; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:32
+; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off
+; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:112
+; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:96
+; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:80
+; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:64
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44
+; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48
+; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52
+; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -2049,8 +27704,1414 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v64bf16_to_v64f16:
+
define void @v_bitcast_v64bf16_to_v64f16(i32 %cond, ptr addrspace(1) %out, <64 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v64bf16_to_v64f16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:40
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:36
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v30, 0
+; GCN-NEXT: v_mov_b32_e32 v58, 0
+; GCN-NEXT: v_mov_b32_e32 v56, 0
+; GCN-NEXT: v_mov_b32_e32 v57, 0
+; GCN-NEXT: v_mov_b32_e32 v45, 0
+; GCN-NEXT: v_mov_b32_e32 v47, 0
+; GCN-NEXT: v_mov_b32_e32 v44, 0
+; GCN-NEXT: v_mov_b32_e32 v46, 0
+; GCN-NEXT: v_mov_b32_e32 v41, 0
+; GCN-NEXT: v_mov_b32_e32 v43, 0
+; GCN-NEXT: v_mov_b32_e32 v40, 0
+; GCN-NEXT: v_mov_b32_e32 v42, 0
+; GCN-NEXT: v_mov_b32_e32 v54, 0
+; GCN-NEXT: v_mov_b32_e32 v55, 0
+; GCN-NEXT: v_mov_b32_e32 v52, 0
+; GCN-NEXT: v_mov_b32_e32 v53, 0
+; GCN-NEXT: v_mov_b32_e32 v49, 0
+; GCN-NEXT: v_mov_b32_e32 v51, 0
+; GCN-NEXT: v_mov_b32_e32 v48, 0
+; GCN-NEXT: v_mov_b32_e32 v50, 0
+; GCN-NEXT: v_mov_b32_e32 v37, 0
+; GCN-NEXT: v_mov_b32_e32 v39, 0
+; GCN-NEXT: v_mov_b32_e32 v36, 0
+; GCN-NEXT: v_mov_b32_e32 v38, 0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v22, 0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB133_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35
+; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v61
+; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v23
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v33
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v28
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v32
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v24
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v31
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v29
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v25
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v63
+; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v60
+; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v62
+; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v59
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v35
+; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v61
+; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v59
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v60
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v61
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v62
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v63
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v51
+; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v51
+; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v51
+; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v52
+; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v50
+; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v53
+; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v54
+; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v49
+; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v55
+; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v40
+; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v41
+; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v42
+; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v43
+; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v44
+; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v45
+; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v46
+; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v47
+; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v56
+; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v57
+; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v58
+; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v48
+; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v39
+; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38
+; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v30, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v58, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v56, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v57, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v45, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v47, v8
+; GCN-NEXT: v_cvt_f32_f16_e32 v44, v9
+; GCN-NEXT: v_cvt_f32_f16_e32 v46, v10
+; GCN-NEXT: v_cvt_f32_f16_e32 v41, v11
+; GCN-NEXT: v_cvt_f32_f16_e32 v43, v12
+; GCN-NEXT: v_cvt_f32_f16_e32 v40, v13
+; GCN-NEXT: v_cvt_f32_f16_e32 v42, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v54, v15
+; GCN-NEXT: v_cvt_f32_f16_e32 v55, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v52, v17
+; GCN-NEXT: v_cvt_f32_f16_e32 v53, v18
+; GCN-NEXT: v_cvt_f32_f16_e32 v49, v19
+; GCN-NEXT: v_cvt_f32_f16_e32 v51, v20
+; GCN-NEXT: v_cvt_f32_f16_e32 v48, v21
+; GCN-NEXT: v_cvt_f32_f16_e32 v50, v22
+; GCN-NEXT: v_cvt_f32_f16_e32 v37, v23
+; GCN-NEXT: v_cvt_f32_f16_e32 v39, v24
+; GCN-NEXT: v_cvt_f32_f16_e32 v36, v25
+; GCN-NEXT: v_cvt_f32_f16_e32 v38, v26
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v27
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v28
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v29
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v31
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v34
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v35
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v59
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v60
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v61
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v62
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v63
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v22, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: .LBB133_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v58
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v30
+; GCN-NEXT: v_or_b32_e32 v3, v3, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v57
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v56
+; GCN-NEXT: v_or_b32_e32 v4, v4, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v47
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v45
+; GCN-NEXT: v_or_b32_e32 v5, v5, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v46
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v44
+; GCN-NEXT: v_or_b32_e32 v6, v6, v0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v43
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v41
+; GCN-NEXT: v_or_b32_e32 v3, v3, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v42
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v40
+; GCN-NEXT: v_or_b32_e32 v4, v4, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v55
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v54
+; GCN-NEXT: v_or_b32_e32 v5, v5, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v53
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v52
+; GCN-NEXT: v_or_b32_e32 v6, v6, v0
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v51
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v49
+; GCN-NEXT: v_or_b32_e32 v3, v3, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v50
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v48
+; GCN-NEXT: v_or_b32_e32 v4, v4, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v37
+; GCN-NEXT: v_or_b32_e32 v5, v5, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v38
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v36
+; GCN-NEXT: v_or_b32_e32 v6, v6, v0
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_or_b32_e32 v3, v3, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: v_or_b32_e32 v4, v4, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_or_b32_e32 v5, v5, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: v_or_b32_e32 v6, v6, v0
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_or_b32_e32 v3, v3, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: v_or_b32_e32 v4, v4, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_or_b32_e32 v5, v5, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: v_or_b32_e32 v6, v6, v0
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_or_b32_e32 v3, v3, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: v_or_b32_e32 v4, v4, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_or_b32_e32 v5, v5, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: v_or_b32_e32 v6, v6, v0
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_or_b32_e32 v3, v3, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: v_or_b32_e32 v4, v4, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_or_b32_e32 v5, v5, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: v_or_b32_e32 v6, v6, v0
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_or_b32_e32 v3, v3, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: v_or_b32_e32 v4, v4, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v22
+; GCN-NEXT: v_or_b32_e32 v5, v5, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: v_or_b32_e32 v6, v6, v0
+; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v64bf16_to_v64f16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: s_mov_b32 s4, 0
+; VI-NEXT: s_mov_b32 s19, s4
+; VI-NEXT: s_mov_b32 s5, s4
+; VI-NEXT: s_mov_b32 s6, s4
+; VI-NEXT: s_mov_b32 s7, s4
+; VI-NEXT: s_mov_b32 s8, s4
+; VI-NEXT: s_mov_b32 s9, s4
+; VI-NEXT: s_mov_b32 s10, s4
+; VI-NEXT: s_mov_b32 s11, s4
+; VI-NEXT: s_mov_b32 s12, s4
+; VI-NEXT: s_mov_b32 s13, s4
+; VI-NEXT: s_mov_b32 s14, s4
+; VI-NEXT: s_mov_b32 s15, s4
+; VI-NEXT: s_mov_b32 s16, s4
+; VI-NEXT: s_mov_b32 s17, s4
+; VI-NEXT: s_mov_b32 s18, s4
+; VI-NEXT: v_mov_b32_e32 v50, s19
+; VI-NEXT: v_mov_b32_e32 v49, s18
+; VI-NEXT: v_mov_b32_e32 v48, s17
+; VI-NEXT: v_mov_b32_e32 v47, s16
+; VI-NEXT: v_mov_b32_e32 v46, s15
+; VI-NEXT: v_mov_b32_e32 v45, s14
+; VI-NEXT: v_mov_b32_e32 v44, s13
+; VI-NEXT: v_mov_b32_e32 v43, s12
+; VI-NEXT: v_mov_b32_e32 v42, s11
+; VI-NEXT: v_mov_b32_e32 v41, s10
+; VI-NEXT: v_mov_b32_e32 v40, s9
+; VI-NEXT: v_mov_b32_e32 v39, s8
+; VI-NEXT: v_mov_b32_e32 v38, s7
+; VI-NEXT: v_mov_b32_e32 v37, s6
+; VI-NEXT: v_mov_b32_e32 v36, s5
+; VI-NEXT: v_mov_b32_e32 v35, s4
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB133_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: v_mov_b32_e32 v50, v18
+; VI-NEXT: v_mov_b32_e32 v49, v17
+; VI-NEXT: v_mov_b32_e32 v48, v16
+; VI-NEXT: v_mov_b32_e32 v47, v15
+; VI-NEXT: v_mov_b32_e32 v46, v14
+; VI-NEXT: v_mov_b32_e32 v45, v13
+; VI-NEXT: v_mov_b32_e32 v44, v12
+; VI-NEXT: v_mov_b32_e32 v43, v11
+; VI-NEXT: v_mov_b32_e32 v42, v10
+; VI-NEXT: v_mov_b32_e32 v41, v9
+; VI-NEXT: v_mov_b32_e32 v40, v8
+; VI-NEXT: v_mov_b32_e32 v39, v7
+; VI-NEXT: v_mov_b32_e32 v38, v6
+; VI-NEXT: v_mov_b32_e32 v37, v5
+; VI-NEXT: v_mov_b32_e32 v36, v4
+; VI-NEXT: v_mov_b32_e32 v35, v3
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: .LBB133_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[47:50]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46]
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[35:38]
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; VI-NEXT: s_movk_i32 s4, 0x70
+; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: s_movk_i32 s4, 0x60
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[17:20]
+; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: s_movk_i32 s4, 0x50
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[13:16]
+; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 64, v1
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[3:4], v[9:12]
+; VI-NEXT: flat_store_dwordx4 v[0:1], v[5:8]
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v64bf16_to_v64f16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_mov_b32 s19, s4
+; GFX9-NEXT: s_mov_b32 s5, s4
+; GFX9-NEXT: s_mov_b32 s6, s4
+; GFX9-NEXT: s_mov_b32 s7, s4
+; GFX9-NEXT: s_mov_b32 s8, s4
+; GFX9-NEXT: s_mov_b32 s9, s4
+; GFX9-NEXT: s_mov_b32 s10, s4
+; GFX9-NEXT: s_mov_b32 s11, s4
+; GFX9-NEXT: s_mov_b32 s12, s4
+; GFX9-NEXT: s_mov_b32 s13, s4
+; GFX9-NEXT: s_mov_b32 s14, s4
+; GFX9-NEXT: s_mov_b32 s15, s4
+; GFX9-NEXT: s_mov_b32 s16, s4
+; GFX9-NEXT: s_mov_b32 s17, s4
+; GFX9-NEXT: s_mov_b32 s18, s4
+; GFX9-NEXT: v_mov_b32_e32 v50, s19
+; GFX9-NEXT: v_mov_b32_e32 v49, s18
+; GFX9-NEXT: v_mov_b32_e32 v48, s17
+; GFX9-NEXT: v_mov_b32_e32 v47, s16
+; GFX9-NEXT: v_mov_b32_e32 v46, s15
+; GFX9-NEXT: v_mov_b32_e32 v45, s14
+; GFX9-NEXT: v_mov_b32_e32 v44, s13
+; GFX9-NEXT: v_mov_b32_e32 v43, s12
+; GFX9-NEXT: v_mov_b32_e32 v42, s11
+; GFX9-NEXT: v_mov_b32_e32 v41, s10
+; GFX9-NEXT: v_mov_b32_e32 v40, s9
+; GFX9-NEXT: v_mov_b32_e32 v39, s8
+; GFX9-NEXT: v_mov_b32_e32 v38, s7
+; GFX9-NEXT: v_mov_b32_e32 v37, s6
+; GFX9-NEXT: v_mov_b32_e32 v36, s5
+; GFX9-NEXT: v_mov_b32_e32 v35, s4
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB133_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: v_mov_b32_e32 v50, v18
+; GFX9-NEXT: v_mov_b32_e32 v49, v17
+; GFX9-NEXT: v_mov_b32_e32 v48, v16
+; GFX9-NEXT: v_mov_b32_e32 v47, v15
+; GFX9-NEXT: v_mov_b32_e32 v46, v14
+; GFX9-NEXT: v_mov_b32_e32 v45, v13
+; GFX9-NEXT: v_mov_b32_e32 v44, v12
+; GFX9-NEXT: v_mov_b32_e32 v43, v11
+; GFX9-NEXT: v_mov_b32_e32 v42, v10
+; GFX9-NEXT: v_mov_b32_e32 v41, v9
+; GFX9-NEXT: v_mov_b32_e32 v40, v8
+; GFX9-NEXT: v_mov_b32_e32 v39, v7
+; GFX9-NEXT: v_mov_b32_e32 v38, v6
+; GFX9-NEXT: v_mov_b32_e32 v37, v5
+; GFX9-NEXT: v_mov_b32_e32 v36, v4
+; GFX9-NEXT: v_mov_b32_e32 v35, v3
+; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(28)
+; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: .LBB133_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:112
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:96
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:80
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off offset:64
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v64bf16_to_v64f16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72
+; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68
+; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64
+; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60
+; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56
+; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52
+; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48
+; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44
+; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40
+; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36
+; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32
+; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28
+; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24
+; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20
+; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
+; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s15, s0
+; GFX11-NEXT: s_mov_b32 s1, s0
+; GFX11-NEXT: s_mov_b32 s2, s0
+; GFX11-NEXT: s_mov_b32 s3, s0
+; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s5, s0
+; GFX11-NEXT: s_mov_b32 s6, s0
+; GFX11-NEXT: s_mov_b32 s7, s0
+; GFX11-NEXT: s_mov_b32 s8, s0
+; GFX11-NEXT: s_mov_b32 s9, s0
+; GFX11-NEXT: s_mov_b32 s10, s0
+; GFX11-NEXT: s_mov_b32 s11, s0
+; GFX11-NEXT: s_mov_b32 s12, s0
+; GFX11-NEXT: s_mov_b32 s13, s0
+; GFX11-NEXT: s_mov_b32 s14, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v66, s15 :: v_dual_mov_b32 v65, s14
+; GFX11-NEXT: v_dual_mov_b32 v52, s1 :: v_dual_mov_b32 v51, s0
+; GFX11-NEXT: v_dual_mov_b32 v64, s13 :: v_dual_mov_b32 v63, s12
+; GFX11-NEXT: v_dual_mov_b32 v62, s11 :: v_dual_mov_b32 v61, s10
+; GFX11-NEXT: v_dual_mov_b32 v60, s9 :: v_dual_mov_b32 v59, s8
+; GFX11-NEXT: v_dual_mov_b32 v58, s7 :: v_dual_mov_b32 v57, s6
+; GFX11-NEXT: v_dual_mov_b32 v56, s5 :: v_dual_mov_b32 v55, s4
+; GFX11-NEXT: v_dual_mov_b32 v54, s3 :: v_dual_mov_b32 v53, s2
+; GFX11-NEXT: v_dual_mov_b32 v35, v51 :: v_dual_mov_b32 v36, v52
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_mov_b32 v39, v55 :: v_dual_mov_b32 v40, v56
+; GFX11-NEXT: v_dual_mov_b32 v37, v53 :: v_dual_mov_b32 v38, v54
+; GFX11-NEXT: v_dual_mov_b32 v41, v57 :: v_dual_mov_b32 v42, v58
+; GFX11-NEXT: v_dual_mov_b32 v43, v59 :: v_dual_mov_b32 v44, v60
+; GFX11-NEXT: v_dual_mov_b32 v45, v61 :: v_dual_mov_b32 v46, v62
+; GFX11-NEXT: v_dual_mov_b32 v47, v63 :: v_dual_mov_b32 v48, v64
+; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT: s_cbranch_execz .LBB133_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33
+; GFX11-NEXT: v_dual_mov_b32 v64, v16 :: v_dual_mov_b32 v63, v15
+; GFX11-NEXT: v_dual_mov_b32 v62, v14 :: v_dual_mov_b32 v61, v13
+; GFX11-NEXT: v_dual_mov_b32 v60, v12 :: v_dual_mov_b32 v59, v11
+; GFX11-NEXT: v_dual_mov_b32 v58, v10 :: v_dual_mov_b32 v57, v9
+; GFX11-NEXT: v_dual_mov_b32 v56, v8 :: v_dual_mov_b32 v55, v7
+; GFX11-NEXT: v_dual_mov_b32 v54, v6 :: v_dual_mov_b32 v53, v5
+; GFX11-NEXT: v_dual_mov_b32 v52, v4 :: v_dual_mov_b32 v51, v3
+; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31
+; GFX11-NEXT: v_dual_mov_b32 v46, v30 :: v_dual_mov_b32 v45, v29
+; GFX11-NEXT: v_dual_mov_b32 v44, v28 :: v_dual_mov_b32 v43, v27
+; GFX11-NEXT: v_dual_mov_b32 v42, v26 :: v_dual_mov_b32 v41, v25
+; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23
+; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21
+; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19
+; GFX11-NEXT: .LBB133_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48
+; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:32
+; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off
+; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:112
+; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:96
+; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:80
+; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:64
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44
+; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48
+; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52
+; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -2065,8 +29126,782 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v64bf16_to_v128i8:
+
define void @v_bitcast_v64bf16_to_v128i8(i32 %cond, ptr addrspace(1) %out, <64 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v64bf16_to_v128i8:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:140
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:56
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:28
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:24
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v31, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v32, v31
+; GCN-NEXT: v_mov_b32_e32 v33, v31
+; GCN-NEXT: v_mov_b32_e32 v34, v31
+; GCN-NEXT: v_mov_b32_e32 v35, v31
+; GCN-NEXT: v_mov_b32_e32 v36, v31
+; GCN-NEXT: v_mov_b32_e32 v37, v31
+; GCN-NEXT: v_mov_b32_e32 v38, v31
+; GCN-NEXT: v_mov_b32_e32 v48, v31
+; GCN-NEXT: v_mov_b32_e32 v49, v31
+; GCN-NEXT: v_mov_b32_e32 v50, v31
+; GCN-NEXT: v_mov_b32_e32 v51, v31
+; GCN-NEXT: v_mov_b32_e32 v52, v31
+; GCN-NEXT: v_mov_b32_e32 v53, v31
+; GCN-NEXT: v_mov_b32_e32 v54, v31
+; GCN-NEXT: v_mov_b32_e32 v55, v31
+; GCN-NEXT: v_mov_b32_e32 v39, v31
+; GCN-NEXT: v_mov_b32_e32 v40, v31
+; GCN-NEXT: v_mov_b32_e32 v41, v31
+; GCN-NEXT: v_mov_b32_e32 v42, v31
+; GCN-NEXT: v_mov_b32_e32 v43, v31
+; GCN-NEXT: v_mov_b32_e32 v44, v31
+; GCN-NEXT: v_mov_b32_e32 v45, v31
+; GCN-NEXT: v_mov_b32_e32 v46, v31
+; GCN-NEXT: v_mov_b32_e32 v56, v31
+; GCN-NEXT: v_mov_b32_e32 v57, v31
+; GCN-NEXT: v_mov_b32_e32 v58, v31
+; GCN-NEXT: v_mov_b32_e32 v59, v31
+; GCN-NEXT: v_mov_b32_e32 v60, v31
+; GCN-NEXT: v_mov_b32_e32 v61, v31
+; GCN-NEXT: v_mov_b32_e32 v62, v31
+; GCN-NEXT: v_mov_b32_e32 v63, v31
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB134_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v33, v0, v3, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v34, v0, v3, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_alignbit_b32 v35, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v36, v4, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v47
+; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v30
+; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v29
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v11
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v12
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v13
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v14
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v15
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v16
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v17
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v18
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v19
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v48
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v50
+; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v51
+; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v52
+; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v60
+; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v45
+; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40
+; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v53
+; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54
+; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v55
+; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v41
+; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42
+; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v43
+; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v44
+; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v46
+; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v56
+; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v57
+; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v58
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_alignbit_b32 v37, v22, v37, 16
+; GCN-NEXT: v_alignbit_b32 v38, v29, v38, 16
+; GCN-NEXT: v_alignbit_b32 v48, v30, v39, 16
+; GCN-NEXT: v_alignbit_b32 v49, v50, v49, 16
+; GCN-NEXT: v_alignbit_b32 v50, v51, v59, 16
+; GCN-NEXT: v_alignbit_b32 v51, v52, v62, 16
+; GCN-NEXT: v_alignbit_b32 v52, v40, v63, 16
+; GCN-NEXT: v_alignbit_b32 v53, v53, v61, 16
+; GCN-NEXT: v_alignbit_b32 v54, v54, v0, 16
+; GCN-NEXT: v_alignbit_b32 v55, v55, v3, 16
+; GCN-NEXT: v_alignbit_b32 v39, v41, v4, 16
+; GCN-NEXT: v_alignbit_b32 v40, v42, v5, 16
+; GCN-NEXT: v_alignbit_b32 v41, v43, v6, 16
+; GCN-NEXT: v_alignbit_b32 v42, v44, v7, 16
+; GCN-NEXT: v_alignbit_b32 v43, v45, v8, 16
+; GCN-NEXT: v_alignbit_b32 v44, v46, v9, 16
+; GCN-NEXT: v_alignbit_b32 v45, v47, v10, 16
+; GCN-NEXT: v_alignbit_b32 v46, v56, v11, 16
+; GCN-NEXT: v_alignbit_b32 v56, v27, v12, 16
+; GCN-NEXT: v_alignbit_b32 v57, v26, v13, 16
+; GCN-NEXT: v_alignbit_b32 v58, v25, v14, 16
+; GCN-NEXT: v_alignbit_b32 v59, v28, v15, 16
+; GCN-NEXT: v_alignbit_b32 v60, v23, v16, 16
+; GCN-NEXT: v_alignbit_b32 v61, v24, v17, 16
+; GCN-NEXT: v_alignbit_b32 v62, v20, v18, 16
+; GCN-NEXT: v_alignbit_b32 v63, v21, v19, 16
+; GCN-NEXT: .LBB134_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[60:63], v[1:2], s[4:7], 0 addr64 offset:112
+; GCN-NEXT: buffer_store_dwordx4 v[56:59], v[1:2], s[4:7], 0 addr64 offset:96
+; GCN-NEXT: buffer_store_dwordx4 v[43:46], v[1:2], s[4:7], 0 addr64 offset:80
+; GCN-NEXT: buffer_store_dwordx4 v[39:42], v[1:2], s[4:7], 0 addr64 offset:64
+; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt expcnt(6)
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(14)
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt expcnt(5)
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt expcnt(4)
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v64bf16_to_v128i8:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v3, 0
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: v_mov_b32_e32 v5, v3
+; VI-NEXT: v_mov_b32_e32 v6, v3
+; VI-NEXT: v_mov_b32_e32 v7, v3
+; VI-NEXT: v_mov_b32_e32 v8, v3
+; VI-NEXT: v_mov_b32_e32 v9, v3
+; VI-NEXT: v_mov_b32_e32 v10, v3
+; VI-NEXT: v_mov_b32_e32 v11, v3
+; VI-NEXT: v_mov_b32_e32 v12, v3
+; VI-NEXT: v_mov_b32_e32 v13, v3
+; VI-NEXT: v_mov_b32_e32 v14, v3
+; VI-NEXT: v_mov_b32_e32 v15, v3
+; VI-NEXT: v_mov_b32_e32 v16, v3
+; VI-NEXT: v_mov_b32_e32 v17, v3
+; VI-NEXT: v_mov_b32_e32 v18, v3
+; VI-NEXT: v_mov_b32_e32 v19, v3
+; VI-NEXT: v_mov_b32_e32 v20, v3
+; VI-NEXT: v_mov_b32_e32 v21, v3
+; VI-NEXT: v_mov_b32_e32 v22, v3
+; VI-NEXT: v_mov_b32_e32 v23, v3
+; VI-NEXT: v_mov_b32_e32 v24, v3
+; VI-NEXT: v_mov_b32_e32 v25, v3
+; VI-NEXT: v_mov_b32_e32 v26, v3
+; VI-NEXT: v_mov_b32_e32 v27, v3
+; VI-NEXT: v_mov_b32_e32 v28, v3
+; VI-NEXT: v_mov_b32_e32 v29, v3
+; VI-NEXT: v_mov_b32_e32 v30, v3
+; VI-NEXT: v_mov_b32_e32 v31, v3
+; VI-NEXT: v_mov_b32_e32 v32, v3
+; VI-NEXT: v_mov_b32_e32 v33, v3
+; VI-NEXT: v_mov_b32_e32 v34, v3
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB134_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; VI-NEXT: .LBB134_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v35, vcc, 0x70, v1
+; VI-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: flat_store_dwordx4 v[35:36], v[31:34]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_add_u32_e32 v31, vcc, 0x60, v1
+; VI-NEXT: v_addc_u32_e32 v32, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[31:32], v[27:30]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_add_u32_e32 v27, vcc, 0x50, v1
+; VI-NEXT: v_addc_u32_e32 v28, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[27:28], v[23:26]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_add_u32_e32 v23, vcc, 64, v1
+; VI-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[23:24], v[19:22]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_add_u32_e32 v19, vcc, 48, v1
+; VI-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[19:20], v[15:18]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_add_u32_e32 v15, vcc, 32, v1
+; VI-NEXT: v_addc_u32_e32 v16, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[15:16], v[11:14]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_add_u32_e32 v11, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[11:12], v[7:10]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[3:6]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v64bf16_to_v128i8:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(28)
+; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: v_mov_b32_e32 v6, v3
+; GFX9-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-NEXT: v_mov_b32_e32 v8, v3
+; GFX9-NEXT: v_mov_b32_e32 v9, v3
+; GFX9-NEXT: v_mov_b32_e32 v10, v3
+; GFX9-NEXT: v_mov_b32_e32 v11, v3
+; GFX9-NEXT: v_mov_b32_e32 v12, v3
+; GFX9-NEXT: v_mov_b32_e32 v13, v3
+; GFX9-NEXT: v_mov_b32_e32 v14, v3
+; GFX9-NEXT: v_mov_b32_e32 v15, v3
+; GFX9-NEXT: v_mov_b32_e32 v16, v3
+; GFX9-NEXT: v_mov_b32_e32 v17, v3
+; GFX9-NEXT: v_mov_b32_e32 v18, v3
+; GFX9-NEXT: v_mov_b32_e32 v19, v3
+; GFX9-NEXT: v_mov_b32_e32 v20, v3
+; GFX9-NEXT: v_mov_b32_e32 v21, v3
+; GFX9-NEXT: v_mov_b32_e32 v22, v3
+; GFX9-NEXT: v_mov_b32_e32 v23, v3
+; GFX9-NEXT: v_mov_b32_e32 v24, v3
+; GFX9-NEXT: v_mov_b32_e32 v25, v3
+; GFX9-NEXT: v_mov_b32_e32 v26, v3
+; GFX9-NEXT: v_mov_b32_e32 v27, v3
+; GFX9-NEXT: v_mov_b32_e32 v28, v3
+; GFX9-NEXT: v_mov_b32_e32 v29, v3
+; GFX9-NEXT: v_mov_b32_e32 v30, v3
+; GFX9-NEXT: v_mov_b32_e32 v31, v3
+; GFX9-NEXT: v_mov_b32_e32 v32, v3
+; GFX9-NEXT: v_mov_b32_e32 v33, v3
+; GFX9-NEXT: v_mov_b32_e32 v34, v3
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB134_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-NEXT: .LBB134_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:112
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:96
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:80
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off offset:64
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v64bf16_to_v128i8:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72
+; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68
+; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64
+; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60
+; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56
+; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52
+; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48
+; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44
+; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40
+; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36
+; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32
+; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28
+; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24
+; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20
+; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
+; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: v_mov_b32_e32 v35, 0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_mov_b32_e32 v36, v35
+; GFX11-NEXT: v_mov_b32_e32 v37, v35
+; GFX11-NEXT: v_mov_b32_e32 v38, v35
+; GFX11-NEXT: v_mov_b32_e32 v39, v35
+; GFX11-NEXT: v_mov_b32_e32 v40, v35
+; GFX11-NEXT: v_mov_b32_e32 v41, v35
+; GFX11-NEXT: v_mov_b32_e32 v42, v35
+; GFX11-NEXT: v_mov_b32_e32 v43, v35
+; GFX11-NEXT: v_mov_b32_e32 v44, v35
+; GFX11-NEXT: v_mov_b32_e32 v45, v35
+; GFX11-NEXT: v_mov_b32_e32 v46, v35
+; GFX11-NEXT: v_mov_b32_e32 v47, v35
+; GFX11-NEXT: v_mov_b32_e32 v48, v35
+; GFX11-NEXT: v_mov_b32_e32 v49, v35
+; GFX11-NEXT: v_mov_b32_e32 v50, v35
+; GFX11-NEXT: v_mov_b32_e32 v51, v35
+; GFX11-NEXT: v_mov_b32_e32 v52, v35
+; GFX11-NEXT: v_mov_b32_e32 v53, v35
+; GFX11-NEXT: v_mov_b32_e32 v54, v35
+; GFX11-NEXT: v_mov_b32_e32 v55, v35
+; GFX11-NEXT: v_mov_b32_e32 v56, v35
+; GFX11-NEXT: v_mov_b32_e32 v57, v35
+; GFX11-NEXT: v_mov_b32_e32 v58, v35
+; GFX11-NEXT: v_mov_b32_e32 v59, v35
+; GFX11-NEXT: v_mov_b32_e32 v60, v35
+; GFX11-NEXT: v_mov_b32_e32 v61, v35
+; GFX11-NEXT: v_mov_b32_e32 v62, v35
+; GFX11-NEXT: v_mov_b32_e32 v63, v35
+; GFX11-NEXT: v_mov_b32_e32 v64, v35
+; GFX11-NEXT: v_mov_b32_e32 v65, v35
+; GFX11-NEXT: v_mov_b32_e32 v66, v35
+; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-NEXT: s_cbranch_execz .LBB134_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v66, v34 :: v_dual_mov_b32 v65, v33
+; GFX11-NEXT: v_dual_mov_b32 v64, v32 :: v_dual_mov_b32 v63, v31
+; GFX11-NEXT: v_dual_mov_b32 v62, v30 :: v_dual_mov_b32 v61, v29
+; GFX11-NEXT: v_dual_mov_b32 v60, v28 :: v_dual_mov_b32 v59, v27
+; GFX11-NEXT: v_dual_mov_b32 v58, v26 :: v_dual_mov_b32 v57, v25
+; GFX11-NEXT: v_dual_mov_b32 v56, v24 :: v_dual_mov_b32 v55, v23
+; GFX11-NEXT: v_dual_mov_b32 v54, v22 :: v_dual_mov_b32 v53, v21
+; GFX11-NEXT: v_dual_mov_b32 v52, v20 :: v_dual_mov_b32 v51, v19
+; GFX11-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v49, v17
+; GFX11-NEXT: v_dual_mov_b32 v48, v16 :: v_dual_mov_b32 v47, v15
+; GFX11-NEXT: v_dual_mov_b32 v46, v14 :: v_dual_mov_b32 v45, v13
+; GFX11-NEXT: v_dual_mov_b32 v44, v12 :: v_dual_mov_b32 v43, v11
+; GFX11-NEXT: v_dual_mov_b32 v42, v10 :: v_dual_mov_b32 v41, v9
+; GFX11-NEXT: v_dual_mov_b32 v40, v8 :: v_dual_mov_b32 v39, v7
+; GFX11-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v37, v5
+; GFX11-NEXT: v_dual_mov_b32 v36, v4 :: v_dual_mov_b32 v35, v3
+; GFX11-NEXT: .LBB134_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:112
+; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:96
+; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:80
+; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off offset:64
+; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:48
+; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:32
+; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44
+; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48
+; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52
+; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -2081,8 +29916,782 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v64bf16_to_v16i64:
+
define void @v_bitcast_v64bf16_to_v16i64(i32 %cond, ptr addrspace(1) %out, <64 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v64bf16_to_v16i64:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:140
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:56
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:28
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:24
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v31, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v32, v31
+; GCN-NEXT: v_mov_b32_e32 v33, v31
+; GCN-NEXT: v_mov_b32_e32 v34, v31
+; GCN-NEXT: v_mov_b32_e32 v35, v31
+; GCN-NEXT: v_mov_b32_e32 v36, v31
+; GCN-NEXT: v_mov_b32_e32 v37, v31
+; GCN-NEXT: v_mov_b32_e32 v38, v31
+; GCN-NEXT: v_mov_b32_e32 v48, v31
+; GCN-NEXT: v_mov_b32_e32 v49, v31
+; GCN-NEXT: v_mov_b32_e32 v50, v31
+; GCN-NEXT: v_mov_b32_e32 v51, v31
+; GCN-NEXT: v_mov_b32_e32 v52, v31
+; GCN-NEXT: v_mov_b32_e32 v53, v31
+; GCN-NEXT: v_mov_b32_e32 v54, v31
+; GCN-NEXT: v_mov_b32_e32 v55, v31
+; GCN-NEXT: v_mov_b32_e32 v39, v31
+; GCN-NEXT: v_mov_b32_e32 v40, v31
+; GCN-NEXT: v_mov_b32_e32 v41, v31
+; GCN-NEXT: v_mov_b32_e32 v42, v31
+; GCN-NEXT: v_mov_b32_e32 v43, v31
+; GCN-NEXT: v_mov_b32_e32 v44, v31
+; GCN-NEXT: v_mov_b32_e32 v45, v31
+; GCN-NEXT: v_mov_b32_e32 v46, v31
+; GCN-NEXT: v_mov_b32_e32 v56, v31
+; GCN-NEXT: v_mov_b32_e32 v57, v31
+; GCN-NEXT: v_mov_b32_e32 v58, v31
+; GCN-NEXT: v_mov_b32_e32 v59, v31
+; GCN-NEXT: v_mov_b32_e32 v60, v31
+; GCN-NEXT: v_mov_b32_e32 v61, v31
+; GCN-NEXT: v_mov_b32_e32 v62, v31
+; GCN-NEXT: v_mov_b32_e32 v63, v31
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB135_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v33, v0, v3, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v34, v0, v3, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_alignbit_b32 v35, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v36, v4, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v47
+; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v30
+; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v29
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v11
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v12
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v13
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v14
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v15
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v16
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v17
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v18
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v19
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v48
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v50
+; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v51
+; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v52
+; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v60
+; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v45
+; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40
+; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v53
+; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54
+; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v55
+; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v41
+; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42
+; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v43
+; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v44
+; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v46
+; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v56
+; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v57
+; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v58
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_alignbit_b32 v37, v22, v37, 16
+; GCN-NEXT: v_alignbit_b32 v38, v29, v38, 16
+; GCN-NEXT: v_alignbit_b32 v48, v30, v39, 16
+; GCN-NEXT: v_alignbit_b32 v49, v50, v49, 16
+; GCN-NEXT: v_alignbit_b32 v50, v51, v59, 16
+; GCN-NEXT: v_alignbit_b32 v51, v52, v62, 16
+; GCN-NEXT: v_alignbit_b32 v52, v40, v63, 16
+; GCN-NEXT: v_alignbit_b32 v53, v53, v61, 16
+; GCN-NEXT: v_alignbit_b32 v54, v54, v0, 16
+; GCN-NEXT: v_alignbit_b32 v55, v55, v3, 16
+; GCN-NEXT: v_alignbit_b32 v39, v41, v4, 16
+; GCN-NEXT: v_alignbit_b32 v40, v42, v5, 16
+; GCN-NEXT: v_alignbit_b32 v41, v43, v6, 16
+; GCN-NEXT: v_alignbit_b32 v42, v44, v7, 16
+; GCN-NEXT: v_alignbit_b32 v43, v45, v8, 16
+; GCN-NEXT: v_alignbit_b32 v44, v46, v9, 16
+; GCN-NEXT: v_alignbit_b32 v45, v47, v10, 16
+; GCN-NEXT: v_alignbit_b32 v46, v56, v11, 16
+; GCN-NEXT: v_alignbit_b32 v56, v27, v12, 16
+; GCN-NEXT: v_alignbit_b32 v57, v26, v13, 16
+; GCN-NEXT: v_alignbit_b32 v58, v25, v14, 16
+; GCN-NEXT: v_alignbit_b32 v59, v28, v15, 16
+; GCN-NEXT: v_alignbit_b32 v60, v23, v16, 16
+; GCN-NEXT: v_alignbit_b32 v61, v24, v17, 16
+; GCN-NEXT: v_alignbit_b32 v62, v20, v18, 16
+; GCN-NEXT: v_alignbit_b32 v63, v21, v19, 16
+; GCN-NEXT: .LBB135_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[60:63], v[1:2], s[4:7], 0 addr64 offset:112
+; GCN-NEXT: buffer_store_dwordx4 v[56:59], v[1:2], s[4:7], 0 addr64 offset:96
+; GCN-NEXT: buffer_store_dwordx4 v[43:46], v[1:2], s[4:7], 0 addr64 offset:80
+; GCN-NEXT: buffer_store_dwordx4 v[39:42], v[1:2], s[4:7], 0 addr64 offset:64
+; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt expcnt(6)
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(14)
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt expcnt(5)
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt expcnt(4)
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v64bf16_to_v16i64:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v3, 0
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: v_mov_b32_e32 v5, v3
+; VI-NEXT: v_mov_b32_e32 v6, v3
+; VI-NEXT: v_mov_b32_e32 v7, v3
+; VI-NEXT: v_mov_b32_e32 v8, v3
+; VI-NEXT: v_mov_b32_e32 v9, v3
+; VI-NEXT: v_mov_b32_e32 v10, v3
+; VI-NEXT: v_mov_b32_e32 v11, v3
+; VI-NEXT: v_mov_b32_e32 v12, v3
+; VI-NEXT: v_mov_b32_e32 v13, v3
+; VI-NEXT: v_mov_b32_e32 v14, v3
+; VI-NEXT: v_mov_b32_e32 v15, v3
+; VI-NEXT: v_mov_b32_e32 v16, v3
+; VI-NEXT: v_mov_b32_e32 v17, v3
+; VI-NEXT: v_mov_b32_e32 v18, v3
+; VI-NEXT: v_mov_b32_e32 v19, v3
+; VI-NEXT: v_mov_b32_e32 v20, v3
+; VI-NEXT: v_mov_b32_e32 v21, v3
+; VI-NEXT: v_mov_b32_e32 v22, v3
+; VI-NEXT: v_mov_b32_e32 v23, v3
+; VI-NEXT: v_mov_b32_e32 v24, v3
+; VI-NEXT: v_mov_b32_e32 v25, v3
+; VI-NEXT: v_mov_b32_e32 v26, v3
+; VI-NEXT: v_mov_b32_e32 v27, v3
+; VI-NEXT: v_mov_b32_e32 v28, v3
+; VI-NEXT: v_mov_b32_e32 v29, v3
+; VI-NEXT: v_mov_b32_e32 v30, v3
+; VI-NEXT: v_mov_b32_e32 v31, v3
+; VI-NEXT: v_mov_b32_e32 v32, v3
+; VI-NEXT: v_mov_b32_e32 v33, v3
+; VI-NEXT: v_mov_b32_e32 v34, v3
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB135_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; VI-NEXT: .LBB135_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v35, vcc, 0x70, v1
+; VI-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: flat_store_dwordx4 v[35:36], v[31:34]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_add_u32_e32 v31, vcc, 0x60, v1
+; VI-NEXT: v_addc_u32_e32 v32, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[31:32], v[27:30]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_add_u32_e32 v27, vcc, 0x50, v1
+; VI-NEXT: v_addc_u32_e32 v28, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[27:28], v[23:26]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_add_u32_e32 v23, vcc, 64, v1
+; VI-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[23:24], v[19:22]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_add_u32_e32 v19, vcc, 48, v1
+; VI-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[19:20], v[15:18]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_add_u32_e32 v15, vcc, 32, v1
+; VI-NEXT: v_addc_u32_e32 v16, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[15:16], v[11:14]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_add_u32_e32 v11, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[11:12], v[7:10]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[3:6]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v64bf16_to_v16i64:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(28)
+; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: v_mov_b32_e32 v6, v3
+; GFX9-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-NEXT: v_mov_b32_e32 v8, v3
+; GFX9-NEXT: v_mov_b32_e32 v9, v3
+; GFX9-NEXT: v_mov_b32_e32 v10, v3
+; GFX9-NEXT: v_mov_b32_e32 v11, v3
+; GFX9-NEXT: v_mov_b32_e32 v12, v3
+; GFX9-NEXT: v_mov_b32_e32 v13, v3
+; GFX9-NEXT: v_mov_b32_e32 v14, v3
+; GFX9-NEXT: v_mov_b32_e32 v15, v3
+; GFX9-NEXT: v_mov_b32_e32 v16, v3
+; GFX9-NEXT: v_mov_b32_e32 v17, v3
+; GFX9-NEXT: v_mov_b32_e32 v18, v3
+; GFX9-NEXT: v_mov_b32_e32 v19, v3
+; GFX9-NEXT: v_mov_b32_e32 v20, v3
+; GFX9-NEXT: v_mov_b32_e32 v21, v3
+; GFX9-NEXT: v_mov_b32_e32 v22, v3
+; GFX9-NEXT: v_mov_b32_e32 v23, v3
+; GFX9-NEXT: v_mov_b32_e32 v24, v3
+; GFX9-NEXT: v_mov_b32_e32 v25, v3
+; GFX9-NEXT: v_mov_b32_e32 v26, v3
+; GFX9-NEXT: v_mov_b32_e32 v27, v3
+; GFX9-NEXT: v_mov_b32_e32 v28, v3
+; GFX9-NEXT: v_mov_b32_e32 v29, v3
+; GFX9-NEXT: v_mov_b32_e32 v30, v3
+; GFX9-NEXT: v_mov_b32_e32 v31, v3
+; GFX9-NEXT: v_mov_b32_e32 v32, v3
+; GFX9-NEXT: v_mov_b32_e32 v33, v3
+; GFX9-NEXT: v_mov_b32_e32 v34, v3
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB135_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-NEXT: .LBB135_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:112
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:96
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:80
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off offset:64
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v64bf16_to_v16i64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72
+; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68
+; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64
+; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60
+; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56
+; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52
+; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48
+; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44
+; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40
+; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36
+; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32
+; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28
+; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24
+; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20
+; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
+; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: v_mov_b32_e32 v35, 0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_mov_b32_e32 v36, v35
+; GFX11-NEXT: v_mov_b32_e32 v37, v35
+; GFX11-NEXT: v_mov_b32_e32 v38, v35
+; GFX11-NEXT: v_mov_b32_e32 v39, v35
+; GFX11-NEXT: v_mov_b32_e32 v40, v35
+; GFX11-NEXT: v_mov_b32_e32 v41, v35
+; GFX11-NEXT: v_mov_b32_e32 v42, v35
+; GFX11-NEXT: v_mov_b32_e32 v43, v35
+; GFX11-NEXT: v_mov_b32_e32 v44, v35
+; GFX11-NEXT: v_mov_b32_e32 v45, v35
+; GFX11-NEXT: v_mov_b32_e32 v46, v35
+; GFX11-NEXT: v_mov_b32_e32 v47, v35
+; GFX11-NEXT: v_mov_b32_e32 v48, v35
+; GFX11-NEXT: v_mov_b32_e32 v49, v35
+; GFX11-NEXT: v_mov_b32_e32 v50, v35
+; GFX11-NEXT: v_mov_b32_e32 v51, v35
+; GFX11-NEXT: v_mov_b32_e32 v52, v35
+; GFX11-NEXT: v_mov_b32_e32 v53, v35
+; GFX11-NEXT: v_mov_b32_e32 v54, v35
+; GFX11-NEXT: v_mov_b32_e32 v55, v35
+; GFX11-NEXT: v_mov_b32_e32 v56, v35
+; GFX11-NEXT: v_mov_b32_e32 v57, v35
+; GFX11-NEXT: v_mov_b32_e32 v58, v35
+; GFX11-NEXT: v_mov_b32_e32 v59, v35
+; GFX11-NEXT: v_mov_b32_e32 v60, v35
+; GFX11-NEXT: v_mov_b32_e32 v61, v35
+; GFX11-NEXT: v_mov_b32_e32 v62, v35
+; GFX11-NEXT: v_mov_b32_e32 v63, v35
+; GFX11-NEXT: v_mov_b32_e32 v64, v35
+; GFX11-NEXT: v_mov_b32_e32 v65, v35
+; GFX11-NEXT: v_mov_b32_e32 v66, v35
+; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-NEXT: s_cbranch_execz .LBB135_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v66, v34 :: v_dual_mov_b32 v65, v33
+; GFX11-NEXT: v_dual_mov_b32 v64, v32 :: v_dual_mov_b32 v63, v31
+; GFX11-NEXT: v_dual_mov_b32 v62, v30 :: v_dual_mov_b32 v61, v29
+; GFX11-NEXT: v_dual_mov_b32 v60, v28 :: v_dual_mov_b32 v59, v27
+; GFX11-NEXT: v_dual_mov_b32 v58, v26 :: v_dual_mov_b32 v57, v25
+; GFX11-NEXT: v_dual_mov_b32 v56, v24 :: v_dual_mov_b32 v55, v23
+; GFX11-NEXT: v_dual_mov_b32 v54, v22 :: v_dual_mov_b32 v53, v21
+; GFX11-NEXT: v_dual_mov_b32 v52, v20 :: v_dual_mov_b32 v51, v19
+; GFX11-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v49, v17
+; GFX11-NEXT: v_dual_mov_b32 v48, v16 :: v_dual_mov_b32 v47, v15
+; GFX11-NEXT: v_dual_mov_b32 v46, v14 :: v_dual_mov_b32 v45, v13
+; GFX11-NEXT: v_dual_mov_b32 v44, v12 :: v_dual_mov_b32 v43, v11
+; GFX11-NEXT: v_dual_mov_b32 v42, v10 :: v_dual_mov_b32 v41, v9
+; GFX11-NEXT: v_dual_mov_b32 v40, v8 :: v_dual_mov_b32 v39, v7
+; GFX11-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v37, v5
+; GFX11-NEXT: v_dual_mov_b32 v36, v4 :: v_dual_mov_b32 v35, v3
+; GFX11-NEXT: .LBB135_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:112
+; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:96
+; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:80
+; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off offset:64
+; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:48
+; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:32
+; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44
+; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48
+; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52
+; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@@ -2097,8 +30706,782 @@ end:
ret void
}
-; CHECK-LABEL: {{^}}v_bitcast_v64bf16_to_v16f64:
+
define void @v_bitcast_v64bf16_to_v16f64(i32 %cond, ptr addrspace(1) %out, <64 x bfloat> %value) {
+; GCN-LABEL: v_bitcast_v64bf16_to_v16f64:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:140
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:56
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:28
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:24
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_mov_b32_e32 v31, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v32, v31
+; GCN-NEXT: v_mov_b32_e32 v33, v31
+; GCN-NEXT: v_mov_b32_e32 v34, v31
+; GCN-NEXT: v_mov_b32_e32 v35, v31
+; GCN-NEXT: v_mov_b32_e32 v36, v31
+; GCN-NEXT: v_mov_b32_e32 v37, v31
+; GCN-NEXT: v_mov_b32_e32 v38, v31
+; GCN-NEXT: v_mov_b32_e32 v48, v31
+; GCN-NEXT: v_mov_b32_e32 v49, v31
+; GCN-NEXT: v_mov_b32_e32 v50, v31
+; GCN-NEXT: v_mov_b32_e32 v51, v31
+; GCN-NEXT: v_mov_b32_e32 v52, v31
+; GCN-NEXT: v_mov_b32_e32 v53, v31
+; GCN-NEXT: v_mov_b32_e32 v54, v31
+; GCN-NEXT: v_mov_b32_e32 v55, v31
+; GCN-NEXT: v_mov_b32_e32 v39, v31
+; GCN-NEXT: v_mov_b32_e32 v40, v31
+; GCN-NEXT: v_mov_b32_e32 v41, v31
+; GCN-NEXT: v_mov_b32_e32 v42, v31
+; GCN-NEXT: v_mov_b32_e32 v43, v31
+; GCN-NEXT: v_mov_b32_e32 v44, v31
+; GCN-NEXT: v_mov_b32_e32 v45, v31
+; GCN-NEXT: v_mov_b32_e32 v46, v31
+; GCN-NEXT: v_mov_b32_e32 v56, v31
+; GCN-NEXT: v_mov_b32_e32 v57, v31
+; GCN-NEXT: v_mov_b32_e32 v58, v31
+; GCN-NEXT: v_mov_b32_e32 v59, v31
+; GCN-NEXT: v_mov_b32_e32 v60, v31
+; GCN-NEXT: v_mov_b32_e32 v61, v31
+; GCN-NEXT: v_mov_b32_e32 v62, v31
+; GCN-NEXT: v_mov_b32_e32 v63, v31
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB136_2
+; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v33, v0, v3, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v34, v0, v3, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_alignbit_b32 v35, v0, v3, 16
+; GCN-NEXT: v_alignbit_b32 v36, v4, v5, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v47
+; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v30
+; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v29
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v11
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v12
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v13
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v14
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v15
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v16
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v17
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v18
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v19
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v48
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v50
+; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v51
+; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v52
+; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v60
+; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v45
+; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40
+; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v53
+; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54
+; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v55
+; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v41
+; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42
+; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v43
+; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v44
+; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v46
+; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v56
+; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v57
+; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v58
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_alignbit_b32 v37, v22, v37, 16
+; GCN-NEXT: v_alignbit_b32 v38, v29, v38, 16
+; GCN-NEXT: v_alignbit_b32 v48, v30, v39, 16
+; GCN-NEXT: v_alignbit_b32 v49, v50, v49, 16
+; GCN-NEXT: v_alignbit_b32 v50, v51, v59, 16
+; GCN-NEXT: v_alignbit_b32 v51, v52, v62, 16
+; GCN-NEXT: v_alignbit_b32 v52, v40, v63, 16
+; GCN-NEXT: v_alignbit_b32 v53, v53, v61, 16
+; GCN-NEXT: v_alignbit_b32 v54, v54, v0, 16
+; GCN-NEXT: v_alignbit_b32 v55, v55, v3, 16
+; GCN-NEXT: v_alignbit_b32 v39, v41, v4, 16
+; GCN-NEXT: v_alignbit_b32 v40, v42, v5, 16
+; GCN-NEXT: v_alignbit_b32 v41, v43, v6, 16
+; GCN-NEXT: v_alignbit_b32 v42, v44, v7, 16
+; GCN-NEXT: v_alignbit_b32 v43, v45, v8, 16
+; GCN-NEXT: v_alignbit_b32 v44, v46, v9, 16
+; GCN-NEXT: v_alignbit_b32 v45, v47, v10, 16
+; GCN-NEXT: v_alignbit_b32 v46, v56, v11, 16
+; GCN-NEXT: v_alignbit_b32 v56, v27, v12, 16
+; GCN-NEXT: v_alignbit_b32 v57, v26, v13, 16
+; GCN-NEXT: v_alignbit_b32 v58, v25, v14, 16
+; GCN-NEXT: v_alignbit_b32 v59, v28, v15, 16
+; GCN-NEXT: v_alignbit_b32 v60, v23, v16, 16
+; GCN-NEXT: v_alignbit_b32 v61, v24, v17, 16
+; GCN-NEXT: v_alignbit_b32 v62, v20, v18, 16
+; GCN-NEXT: v_alignbit_b32 v63, v21, v19, 16
+; GCN-NEXT: .LBB136_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx4 v[60:63], v[1:2], s[4:7], 0 addr64 offset:112
+; GCN-NEXT: buffer_store_dwordx4 v[56:59], v[1:2], s[4:7], 0 addr64 offset:96
+; GCN-NEXT: buffer_store_dwordx4 v[43:46], v[1:2], s[4:7], 0 addr64 offset:80
+; GCN-NEXT: buffer_store_dwordx4 v[39:42], v[1:2], s[4:7], 0 addr64 offset:64
+; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt expcnt(6)
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(14)
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt expcnt(5)
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt expcnt(4)
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_bitcast_v64bf16_to_v16f64:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v3, 0
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: v_mov_b32_e32 v5, v3
+; VI-NEXT: v_mov_b32_e32 v6, v3
+; VI-NEXT: v_mov_b32_e32 v7, v3
+; VI-NEXT: v_mov_b32_e32 v8, v3
+; VI-NEXT: v_mov_b32_e32 v9, v3
+; VI-NEXT: v_mov_b32_e32 v10, v3
+; VI-NEXT: v_mov_b32_e32 v11, v3
+; VI-NEXT: v_mov_b32_e32 v12, v3
+; VI-NEXT: v_mov_b32_e32 v13, v3
+; VI-NEXT: v_mov_b32_e32 v14, v3
+; VI-NEXT: v_mov_b32_e32 v15, v3
+; VI-NEXT: v_mov_b32_e32 v16, v3
+; VI-NEXT: v_mov_b32_e32 v17, v3
+; VI-NEXT: v_mov_b32_e32 v18, v3
+; VI-NEXT: v_mov_b32_e32 v19, v3
+; VI-NEXT: v_mov_b32_e32 v20, v3
+; VI-NEXT: v_mov_b32_e32 v21, v3
+; VI-NEXT: v_mov_b32_e32 v22, v3
+; VI-NEXT: v_mov_b32_e32 v23, v3
+; VI-NEXT: v_mov_b32_e32 v24, v3
+; VI-NEXT: v_mov_b32_e32 v25, v3
+; VI-NEXT: v_mov_b32_e32 v26, v3
+; VI-NEXT: v_mov_b32_e32 v27, v3
+; VI-NEXT: v_mov_b32_e32 v28, v3
+; VI-NEXT: v_mov_b32_e32 v29, v3
+; VI-NEXT: v_mov_b32_e32 v30, v3
+; VI-NEXT: v_mov_b32_e32 v31, v3
+; VI-NEXT: v_mov_b32_e32 v32, v3
+; VI-NEXT: v_mov_b32_e32 v33, v3
+; VI-NEXT: v_mov_b32_e32 v34, v3
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_cbranch_execz .LBB136_2
+; VI-NEXT: ; %bb.1: ; %if
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; VI-NEXT: .LBB136_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_add_u32_e32 v35, vcc, 0x70, v1
+; VI-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: flat_store_dwordx4 v[35:36], v[31:34]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_add_u32_e32 v31, vcc, 0x60, v1
+; VI-NEXT: v_addc_u32_e32 v32, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[31:32], v[27:30]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_add_u32_e32 v27, vcc, 0x50, v1
+; VI-NEXT: v_addc_u32_e32 v28, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[27:28], v[23:26]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_add_u32_e32 v23, vcc, 64, v1
+; VI-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[23:24], v[19:22]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_add_u32_e32 v19, vcc, 48, v1
+; VI-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[19:20], v[15:18]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_add_u32_e32 v15, vcc, 32, v1
+; VI-NEXT: v_addc_u32_e32 v16, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[15:16], v[11:14]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_add_u32_e32 v11, vcc, 16, v1
+; VI-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc
+; VI-NEXT: flat_store_dwordx4 v[11:12], v[7:10]
+; VI-NEXT: flat_store_dwordx4 v[1:2], v[3:6]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bitcast_v64bf16_to_v16f64:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(28)
+; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: v_mov_b32_e32 v6, v3
+; GFX9-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-NEXT: v_mov_b32_e32 v8, v3
+; GFX9-NEXT: v_mov_b32_e32 v9, v3
+; GFX9-NEXT: v_mov_b32_e32 v10, v3
+; GFX9-NEXT: v_mov_b32_e32 v11, v3
+; GFX9-NEXT: v_mov_b32_e32 v12, v3
+; GFX9-NEXT: v_mov_b32_e32 v13, v3
+; GFX9-NEXT: v_mov_b32_e32 v14, v3
+; GFX9-NEXT: v_mov_b32_e32 v15, v3
+; GFX9-NEXT: v_mov_b32_e32 v16, v3
+; GFX9-NEXT: v_mov_b32_e32 v17, v3
+; GFX9-NEXT: v_mov_b32_e32 v18, v3
+; GFX9-NEXT: v_mov_b32_e32 v19, v3
+; GFX9-NEXT: v_mov_b32_e32 v20, v3
+; GFX9-NEXT: v_mov_b32_e32 v21, v3
+; GFX9-NEXT: v_mov_b32_e32 v22, v3
+; GFX9-NEXT: v_mov_b32_e32 v23, v3
+; GFX9-NEXT: v_mov_b32_e32 v24, v3
+; GFX9-NEXT: v_mov_b32_e32 v25, v3
+; GFX9-NEXT: v_mov_b32_e32 v26, v3
+; GFX9-NEXT: v_mov_b32_e32 v27, v3
+; GFX9-NEXT: v_mov_b32_e32 v28, v3
+; GFX9-NEXT: v_mov_b32_e32 v29, v3
+; GFX9-NEXT: v_mov_b32_e32 v30, v3
+; GFX9-NEXT: v_mov_b32_e32 v31, v3
+; GFX9-NEXT: v_mov_b32_e32 v32, v3
+; GFX9-NEXT: v_mov_b32_e32 v33, v3
+; GFX9-NEXT: v_mov_b32_e32 v34, v3
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB136_2
+; GFX9-NEXT: ; %bb.1: ; %if
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-NEXT: .LBB136_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:112
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:96
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:80
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off offset:64
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_bitcast_v64bf16_to_v16f64:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72
+; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68
+; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64
+; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60
+; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56
+; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52
+; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48
+; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44
+; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40
+; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36
+; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32
+; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28
+; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24
+; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20
+; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
+; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: v_mov_b32_e32 v35, 0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_mov_b32_e32 v36, v35
+; GFX11-NEXT: v_mov_b32_e32 v37, v35
+; GFX11-NEXT: v_mov_b32_e32 v38, v35
+; GFX11-NEXT: v_mov_b32_e32 v39, v35
+; GFX11-NEXT: v_mov_b32_e32 v40, v35
+; GFX11-NEXT: v_mov_b32_e32 v41, v35
+; GFX11-NEXT: v_mov_b32_e32 v42, v35
+; GFX11-NEXT: v_mov_b32_e32 v43, v35
+; GFX11-NEXT: v_mov_b32_e32 v44, v35
+; GFX11-NEXT: v_mov_b32_e32 v45, v35
+; GFX11-NEXT: v_mov_b32_e32 v46, v35
+; GFX11-NEXT: v_mov_b32_e32 v47, v35
+; GFX11-NEXT: v_mov_b32_e32 v48, v35
+; GFX11-NEXT: v_mov_b32_e32 v49, v35
+; GFX11-NEXT: v_mov_b32_e32 v50, v35
+; GFX11-NEXT: v_mov_b32_e32 v51, v35
+; GFX11-NEXT: v_mov_b32_e32 v52, v35
+; GFX11-NEXT: v_mov_b32_e32 v53, v35
+; GFX11-NEXT: v_mov_b32_e32 v54, v35
+; GFX11-NEXT: v_mov_b32_e32 v55, v35
+; GFX11-NEXT: v_mov_b32_e32 v56, v35
+; GFX11-NEXT: v_mov_b32_e32 v57, v35
+; GFX11-NEXT: v_mov_b32_e32 v58, v35
+; GFX11-NEXT: v_mov_b32_e32 v59, v35
+; GFX11-NEXT: v_mov_b32_e32 v60, v35
+; GFX11-NEXT: v_mov_b32_e32 v61, v35
+; GFX11-NEXT: v_mov_b32_e32 v62, v35
+; GFX11-NEXT: v_mov_b32_e32 v63, v35
+; GFX11-NEXT: v_mov_b32_e32 v64, v35
+; GFX11-NEXT: v_mov_b32_e32 v65, v35
+; GFX11-NEXT: v_mov_b32_e32 v66, v35
+; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-NEXT: s_cbranch_execz .LBB136_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v66, v34 :: v_dual_mov_b32 v65, v33
+; GFX11-NEXT: v_dual_mov_b32 v64, v32 :: v_dual_mov_b32 v63, v31
+; GFX11-NEXT: v_dual_mov_b32 v62, v30 :: v_dual_mov_b32 v61, v29
+; GFX11-NEXT: v_dual_mov_b32 v60, v28 :: v_dual_mov_b32 v59, v27
+; GFX11-NEXT: v_dual_mov_b32 v58, v26 :: v_dual_mov_b32 v57, v25
+; GFX11-NEXT: v_dual_mov_b32 v56, v24 :: v_dual_mov_b32 v55, v23
+; GFX11-NEXT: v_dual_mov_b32 v54, v22 :: v_dual_mov_b32 v53, v21
+; GFX11-NEXT: v_dual_mov_b32 v52, v20 :: v_dual_mov_b32 v51, v19
+; GFX11-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v49, v17
+; GFX11-NEXT: v_dual_mov_b32 v48, v16 :: v_dual_mov_b32 v47, v15
+; GFX11-NEXT: v_dual_mov_b32 v46, v14 :: v_dual_mov_b32 v45, v13
+; GFX11-NEXT: v_dual_mov_b32 v44, v12 :: v_dual_mov_b32 v43, v11
+; GFX11-NEXT: v_dual_mov_b32 v42, v10 :: v_dual_mov_b32 v41, v9
+; GFX11-NEXT: v_dual_mov_b32 v40, v8 :: v_dual_mov_b32 v39, v7
+; GFX11-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v37, v5
+; GFX11-NEXT: v_dual_mov_b32 v36, v4 :: v_dual_mov_b32 v35, v3
+; GFX11-NEXT: .LBB136_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:112
+; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:96
+; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:80
+; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off offset:64
+; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:48
+; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:32
+; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:16
+; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44
+; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48
+; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52
+; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
More information about the llvm-commits mailing list