[llvm] c7952d8 - AMDGPU: Add a few more bfloat codegen tests

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 21 21:31:54 PST 2023


Author: Matt Arsenault
Date: 2023-12-22T12:31:42+07:00
New Revision: c7952d886060bf2fed7d3a4c08d5636db642a3c1

URL: https://github.com/llvm/llvm-project/commit/c7952d886060bf2fed7d3a4c08d5636db642a3c1
DIFF: https://github.com/llvm/llvm-project/commit/c7952d886060bf2fed7d3a4c08d5636db642a3c1.diff

LOG: AMDGPU: Add a few more bfloat codegen tests

Added: 
    

Modified: 
    llvm/test/CodeGen/AMDGPU/bf16.ll
    llvm/test/CodeGen/AMDGPU/function-args.ll
    llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index a63a46c010a90b..85a24a063aa4e1 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -72,432 +72,441 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
   ret void
 }
 
-define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_load_store_f32_to_bf16:
+define <2 x bfloat> @v_load_global_v2bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s4, s6
 ; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_load_store_f32_to_bf16:
+; GFX7-LABEL: v_load_global_v2bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_load_store_f32_to_bf16:
+; GFX8-LABEL: v_load_global_v2bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    flat_store_short v[2:3], v0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_load_store_f32_to_bf16:
+; GFX9-LABEL: v_load_global_v2bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_short_d16_hi v[2:3], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_load_store_f32_to_bf16:
+; GFX10-LABEL: v_load_global_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_short_d16_hi v[2:3], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_load_store_f32_to_bf16:
+; GFX11-LABEL: v_load_global_v2bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_d16_hi_b16 v[2:3], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val = load float, ptr addrspace(1) %in
-  %val.bf16 = fptrunc float %val to bfloat
-  store bfloat %val.bf16, ptr addrspace(1) %out
-  ret void
+  %load = load <2 x bfloat>, ptr addrspace(1) %ptr
+  ret <2 x bfloat> %load
 }
 
-define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_load_store_f64_to_bf16:
+define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v3bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s4, s6
 ; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
-; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_load_store_f64_to_bf16:
+; GFX7-LABEL: v_load_global_v3bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_load_store_f64_to_bf16:
+; GFX8-LABEL: v_load_global_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    flat_store_short v[2:3], v0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_load_store_f64_to_bf16:
+; GFX9-LABEL: v_load_global_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX9-NEXT:    global_store_short_d16_hi v[2:3], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_load_store_f64_to_bf16:
+; GFX10-LABEL: v_load_global_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX10-NEXT:    global_store_short_d16_hi v[2:3], v0, off
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_load_store_f64_to_bf16:
+; GFX11-LABEL: v_load_global_v3bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX11-NEXT:    global_store_d16_hi_b16 v[2:3], v0, off
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val = load double, ptr addrspace(1) %in
-  %val.bf16 = fptrunc double %val to bfloat
-  store bfloat %val.bf16, ptr addrspace(1) %out
-  ret void
+  %load = load <3 x bfloat>, ptr addrspace(1) %ptr
+  ret <3 x bfloat> %load
 }
 
-define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_load_store_bf16_to_f32:
+define <4 x bfloat> @v_load_global_v4bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s4, s6
 ; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_load_store_bf16_to_f32:
+; GFX7-LABEL: v_load_global_v4bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_load_store_bf16_to_f32:
+; GFX8-LABEL: v_load_global_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    flat_store_dword v[2:3], v0
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_load_store_bf16_to_f32:
+; GFX9-LABEL: v_load_global_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    global_load_short_d16_hi v4, v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dword v[2:3], v4, off
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_load_store_bf16_to_f32:
+; GFX10-LABEL: v_load_global_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-NEXT:    global_load_short_d16_hi v4, v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v[2:3], v4, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_load_store_bf16_to_f32:
+; GFX11-LABEL: v_load_global_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
-; GFX11-NEXT:    global_load_d16_hi_b16 v4, v[0:1], off
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b32 v[2:3], v4, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val = load bfloat, ptr addrspace(1) %in
-  %val.f32 = fpext bfloat %val to float
-  store float %val.f32, ptr addrspace(1) %out
-  ret void
+  %load = load <4 x bfloat>, ptr addrspace(1) %ptr
+  ret <4 x bfloat> %load
 }
 
-define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_load_store_bf16_to_f64:
+define <6 x bfloat> @v_load_global_v6bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v6bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s4, s6
 ; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
+; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_load_store_bf16_to_f64:
+; GFX7-LABEL: v_load_global_v6bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx3 v[3:5], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_load_store_bf16_to_f64:
+; GFX8-LABEL: v_load_global_v6bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    flat_load_dwordx3 v[0:2], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_load_store_bf16_to_f64:
+; GFX9-LABEL: v_load_global_v6bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    global_load_short_d16_hi v4, v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_load_store_bf16_to_f64:
+; GFX10-LABEL: v_load_global_v6bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-NEXT:    global_load_short_d16_hi v4, v[0:1], off
+; GFX10-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
-; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_load_store_bf16_to_f64:
+; GFX11-LABEL: v_load_global_v6bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
-; GFX11-NEXT:    global_load_d16_hi_b16 v4, v[0:1], off
+; GFX11-NEXT:    global_load_b96 v[0:2], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
-; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val = load bfloat, ptr addrspace(1) %in
-  %val.f64 = fpext bfloat %val to double
-  store double %val.f64, ptr addrspace(1) %out
-  ret void
+  %load = load <6 x bfloat>, ptr addrspace(1) %ptr
+  ret <6 x bfloat> %load
 }
 
-define void @test_load_store_v2bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_load_store_v2bf16:
+define <8 x bfloat> @v_load_global_v8bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v8bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s4, s6
 ; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_load_store_v2bf16:
+; GFX7-LABEL: v_load_global_v8bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_load_store_v2bf16:
+; GFX8-LABEL: v_load_global_v8bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    flat_store_dword v[2:3], v0
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_load_store_v2bf16:
+; GFX9-LABEL: v_load_global_v8bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dword v[2:3], v0, off
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_load_store_v2bf16:
+; GFX10-LABEL: v_load_global_v8bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_load_store_v2bf16:
+; GFX11-LABEL: v_load_global_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b32 v[2:3], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val = load <2 x bfloat>, ptr addrspace(1) %in
-  store <2 x bfloat> %val, ptr addrspace(1) %out
-  ret void
+  %load = load <8 x bfloat>, ptr addrspace(1) %ptr
+  ret <8 x bfloat> %load
 }
 
-define void @test_load_store_v4bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_load_store_v4bf16:
+define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v16bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s4, s6
 ; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
+; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
+; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
+; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_load_store_v4bf16:
+; GFX7-LABEL: v_load_global_v16bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
+; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_load_store_v4bf16:
+; GFX8-LABEL: v_load_global_v16bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v5, v1
+; GFX8-NEXT:    v_mov_b32_e32 v4, v0
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[4:5]
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 16, v4
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_load_store_v4bf16:
+; GFX9-LABEL: v_load_global_v16bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v9, v1
+; GFX9-NEXT:    v_mov_b32_e32 v8, v0
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[8:9], off
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[8:9], off offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_load_store_v4bf16:
+; GFX10-LABEL: v_load_global_v16bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    v_mov_b32_e32 v9, v1
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[8:9], off
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[8:9], off offset:16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_load_store_v4bf16:
+; GFX11-LABEL: v_load_global_v16bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_load_b128 v[0:3], v[4:5], off
+; GFX11-NEXT:    global_load_b128 v[4:7], v[4:5], off offset:16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val = load <4 x bfloat>, ptr addrspace(1) %in
-  store <4 x bfloat> %val, ptr addrspace(1) %out
-  ret void
+  %load = load <16 x bfloat>, ptr addrspace(1) %ptr
+  ret <16 x bfloat> %load
 }
 
-define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_load_store_v8bf16:
+define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v32bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s6, 0
@@ -505,12 +514,48 @@ define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out)
 ; GCN-NEXT:    s_mov_b32 s4, s6
 ; GCN-NEXT:    s_mov_b32 s5, s6
 ; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
+; GCN-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
+; GCN-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
+; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
+; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
+; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
+; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
+; GCN-NEXT:    v_lshlrev_b32_e32 v18, 16, v21
+; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v21
+; GCN-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
+; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v22
+; GCN-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v24, 16, v28
+; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v28
+; GCN-NEXT:    v_lshlrev_b32_e32 v26, 16, v29
+; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v29
+; GCN-NEXT:    v_lshlrev_b32_e32 v28, 16, v30
+; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v30
+; GCN-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
+; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_load_store_v8bf16:
+; GFX7-LABEL: v_load_global_v32bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -518,256 +563,438 @@ define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out)
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
+; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
+; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
+; GFX7-NEXT:    v_lshlrev_b32_e32 v18, 16, v21
+; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v21
+; GFX7-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
+; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v22
+; GFX7-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v24, 16, v28
+; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v28
+; GFX7-NEXT:    v_lshlrev_b32_e32 v26, 16, v29
+; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v29
+; GFX7-NEXT:    v_lshlrev_b32_e32 v28, 16, v30
+; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v30
+; GFX7-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
+; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_load_store_v8bf16:
+; GFX8-LABEL: v_load_global_v32bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-NEXT:    v_mov_b32_e32 v12, v0
+; GFX8-NEXT:    v_mov_b32_e32 v13, v1
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 16, v12
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v13, vcc
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 32, v12
+; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v13, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[12:13]
+; GFX8-NEXT:    v_add_u32_e32 v12, vcc, 48, v12
+; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, 0, v13, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GFX8-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GFX8-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_load_store_v8bf16:
+; GFX9-LABEL: v_load_global_v32bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-NEXT:    v_mov_b32_e32 v17, v1
+; GFX9-NEXT:    v_mov_b32_e32 v16, v0
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[16:17], off
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[16:17], off offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[16:17], off offset:32
+; GFX9-NEXT:    global_load_dwordx4 v[12:15], v[16:17], off offset:48
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_load_store_v8bf16:
+; GFX10-LABEL: v_load_global_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-NEXT:    v_mov_b32_e32 v17, v1
+; GFX10-NEXT:    v_mov_b32_e32 v16, v0
+; GFX10-NEXT:    s_clause 0x3
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[16:17], off
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[16:17], off offset:16
+; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[16:17], off offset:32
+; GFX10-NEXT:    global_load_dwordx4 v[12:15], v[16:17], off offset:48
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_load_store_v8bf16:
+; GFX11-LABEL: v_load_global_v32bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX11-NEXT:    v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v12, v0
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    global_load_b128 v[0:3], v[12:13], off
+; GFX11-NEXT:    global_load_b128 v[4:7], v[12:13], off offset:16
+; GFX11-NEXT:    global_load_b128 v[8:11], v[12:13], off offset:32
+; GFX11-NEXT:    global_load_b128 v[12:15], v[12:13], off offset:48
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b128 v[2:3], v[4:7], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val = load <8 x bfloat>, ptr addrspace(1) %in
-  store <8 x bfloat> %val, ptr addrspace(1) %out
-  ret void
+  %load = load <32 x bfloat>, ptr addrspace(1) %ptr
+  ret <32 x bfloat> %load
 }
 
-define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_load_store_v16bf16:
+define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v64bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, 0x7c, v0
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, 0x78, v0
+; GCN-NEXT:    v_add_i32_e32 v9, vcc, 0x74, v0
+; GCN-NEXT:    v_add_i32_e32 v10, vcc, 0x70, v0
+; GCN-NEXT:    v_add_i32_e32 v11, vcc, 0x6c, v0
+; GCN-NEXT:    v_add_i32_e32 v12, vcc, 0x68, v0
+; GCN-NEXT:    v_add_i32_e32 v13, vcc, 0x64, v0
+; GCN-NEXT:    v_add_i32_e32 v14, vcc, 0x60, v0
 ; GCN-NEXT:    s_mov_b32 s4, s6
 ; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
-; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
+; GCN-NEXT:    v_add_i32_e32 v15, vcc, 0x5c, v0
+; GCN-NEXT:    v_add_i32_e32 v16, vcc, 0x58, v0
+; GCN-NEXT:    v_add_i32_e32 v17, vcc, 0x54, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v4, v9, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v3, v10, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, 0x50, v0
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, 0x4c, v0
+; GCN-NEXT:    v_add_i32_e32 v9, vcc, 0x48, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v6, v11, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v5, v12, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v4, v13, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v3, v14, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
+; GCN-NEXT:    v_add_i32_e32 v10, vcc, 0x44, v0
+; GCN-NEXT:    v_add_i32_e32 v11, vcc, 64, v0
+; GCN-NEXT:    v_add_i32_e32 v19, vcc, 60, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v6, v15, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v5, v16, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v4, v17, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
+; GCN-NEXT:    v_add_i32_e32 v20, vcc, 56, v0
+; GCN-NEXT:    v_add_i32_e32 v21, vcc, 52, v0
+; GCN-NEXT:    v_add_i32_e32 v22, vcc, 48, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v6, v8, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v5, v9, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v4, v10, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v3, v11, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT:    buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT:    buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NEXT:    buffer_store_dword v6, v19, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, 44, v0
+; GCN-NEXT:    buffer_store_dword v5, v20, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 40, v0
+; GCN-NEXT:    buffer_store_dword v4, v21, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, 36, v0
+; GCN-NEXT:    buffer_store_dword v3, v22, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, 32, v0
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, 28, v0
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, 24, v0
+; GCN-NEXT:    v_add_i32_e32 v19, vcc, 20, v0
+; GCN-NEXT:    v_add_i32_e32 v20, vcc, 16, v0
+; GCN-NEXT:    s_waitcnt vmcnt(6)
+; GCN-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; GCN-NEXT:    buffer_store_dword v9, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
+; GCN-NEXT:    buffer_store_dword v8, v4, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, 4, v0
+; GCN-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(8)
+; GCN-NEXT:    buffer_store_dword v18, v5, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v17, v6, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v16, v19, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v15, v20, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v14, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v13, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v12, v4, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_load_store_v16bf16:
+; GFX7-LABEL: v_load_global_v64bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
+; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 0x7c, v0
+; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 0x78, v0
+; GFX7-NEXT:    v_add_i32_e32 v9, vcc, 0x74, v0
+; GFX7-NEXT:    v_add_i32_e32 v10, vcc, 0x70, v0
+; GFX7-NEXT:    v_add_i32_e32 v19, vcc, 52, v0
+; GFX7-NEXT:    v_add_i32_e32 v20, vcc, 48, v0
+; GFX7-NEXT:    v_add_i32_e32 v21, vcc, 44, v0
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, 40, v0
+; GFX7-NEXT:    v_add_i32_e32 v23, vcc, 36, v0
+; GFX7-NEXT:    v_add_i32_e32 v24, vcc, 32, v0
+; GFX7-NEXT:    v_add_i32_e32 v25, vcc, 28, v0
+; GFX7-NEXT:    v_add_i32_e32 v26, vcc, 24, v0
+; GFX7-NEXT:    v_add_i32_e32 v27, vcc, 20, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v4, v9, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v3, v10, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
+; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 0x6c, v0
+; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 0x68, v0
+; GFX7-NEXT:    v_add_i32_e32 v9, vcc, 0x64, v0
+; GFX7-NEXT:    v_add_i32_e32 v10, vcc, 0x60, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v4, v9, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v3, v10, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
+; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 0x5c, v0
+; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 0x58, v0
+; GFX7-NEXT:    v_add_i32_e32 v9, vcc, 0x54, v0
+; GFX7-NEXT:    v_add_i32_e32 v10, vcc, 0x50, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v4, v9, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v3, v10, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
+; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 0x4c, v0
+; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 0x48, v0
+; GFX7-NEXT:    v_add_i32_e32 v9, vcc, 0x44, v0
+; GFX7-NEXT:    v_add_i32_e32 v10, vcc, 64, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v4, v9, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v3, v10, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT:    buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT:    buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT:    buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 60, v0
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 56, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-NEXT:    buffer_store_dword v6, v1, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; GFX7-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 12, v0
+; GFX7-NEXT:    buffer_store_dword v4, v19, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 8, v0
+; GFX7-NEXT:    buffer_store_dword v3, v20, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 4, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(6)
+; GFX7-NEXT:    buffer_store_dword v10, v21, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v9, v22, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v8, v23, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v7, v24, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(9)
+; GFX7-NEXT:    buffer_store_dword v14, v25, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v13, v26, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v12, v27, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(12)
+; GFX7-NEXT:    buffer_store_dword v18, v2, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v17, v4, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v16, v3, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_load_store_v16bf16:
+; GFX8-LABEL: v_load_global_v64bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v28, v0
+; GFX8-NEXT:    v_mov_b32_e32 v29, v1
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 16, v28
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v29, vcc
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 32, v28
+; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v29, vcc
+; GFX8-NEXT:    v_add_u32_e32 v12, vcc, 48, v28
+; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, 0, v29, vcc
+; GFX8-NEXT:    v_add_u32_e32 v16, vcc, 64, v28
+; GFX8-NEXT:    v_addc_u32_e32 v17, vcc, 0, v29, vcc
+; GFX8-NEXT:    s_movk_i32 s4, 0x50
+; GFX8-NEXT:    v_add_u32_e32 v20, vcc, s4, v28
+; GFX8-NEXT:    v_addc_u32_e32 v21, vcc, 0, v29, vcc
+; GFX8-NEXT:    s_movk_i32 s4, 0x60
+; GFX8-NEXT:    v_add_u32_e32 v24, vcc, s4, v28
+; GFX8-NEXT:    v_addc_u32_e32 v25, vcc, 0, v29, vcc
+; GFX8-NEXT:    s_movk_i32 s4, 0x70
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[28:29]
+; GFX8-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
+; GFX8-NEXT:    v_add_u32_e32 v28, vcc, s4, v28
+; GFX8-NEXT:    v_addc_u32_e32 v29, vcc, 0, v29, vcc
 ; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
-; GFX8-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[8:11]
+; GFX8-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GFX8-NEXT:    flat_load_dwordx4 v[16:19], v[16:17]
+; GFX8-NEXT:    flat_load_dwordx4 v[20:23], v[20:21]
+; GFX8-NEXT:    flat_load_dwordx4 v[24:27], v[24:25]
+; GFX8-NEXT:    flat_load_dwordx4 v[28:31], v[28:29]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_load_store_v16bf16:
+; GFX9-LABEL: v_load_global_v64bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
-; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:16
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off
+; GFX9-NEXT:    v_mov_b32_e32 v29, v1
+; GFX9-NEXT:    v_mov_b32_e32 v28, v0
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[28:29], off
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[28:29], off offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[28:29], off offset:32
+; GFX9-NEXT:    global_load_dwordx4 v[12:15], v[28:29], off offset:48
+; GFX9-NEXT:    global_load_dwordx4 v[16:19], v[28:29], off offset:64
+; GFX9-NEXT:    global_load_dwordx4 v[20:23], v[28:29], off offset:80
+; GFX9-NEXT:    global_load_dwordx4 v[24:27], v[28:29], off offset:96
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    global_load_dwordx4 v[28:31], v[28:29], off offset:112
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_load_store_v16bf16:
+; GFX10-LABEL: v_load_global_v64bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
-; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:16
+; GFX10-NEXT:    v_mov_b32_e32 v33, v1
+; GFX10-NEXT:    v_mov_b32_e32 v32, v0
+; GFX10-NEXT:    s_clause 0x7
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[32:33], off
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[32:33], off offset:16
+; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[32:33], off offset:32
+; GFX10-NEXT:    global_load_dwordx4 v[12:15], v[32:33], off offset:48
+; GFX10-NEXT:    global_load_dwordx4 v[16:19], v[32:33], off offset:64
+; GFX10-NEXT:    global_load_dwordx4 v[20:23], v[32:33], off offset:80
+; GFX10-NEXT:    global_load_dwordx4 v[24:27], v[32:33], off offset:96
+; GFX10-NEXT:    global_load_dwordx4 v[28:31], v[32:33], off offset:112
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_load_store_v16bf16:
+; GFX11-LABEL: v_load_global_v64bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off offset:16
-; GFX11-NEXT:    global_load_b128 v[8:11], v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    global_store_b128 v[2:3], v[4:7], off offset:16
+; GFX11-NEXT:    v_dual_mov_b32 v29, v1 :: v_dual_mov_b32 v28, v0
+; GFX11-NEXT:    s_clause 0x7
+; GFX11-NEXT:    global_load_b128 v[0:3], v[28:29], off
+; GFX11-NEXT:    global_load_b128 v[4:7], v[28:29], off offset:16
+; GFX11-NEXT:    global_load_b128 v[8:11], v[28:29], off offset:32
+; GFX11-NEXT:    global_load_b128 v[12:15], v[28:29], off offset:48
+; GFX11-NEXT:    global_load_b128 v[16:19], v[28:29], off offset:64
+; GFX11-NEXT:    global_load_b128 v[20:23], v[28:29], off offset:80
+; GFX11-NEXT:    global_load_b128 v[24:27], v[28:29], off offset:96
+; GFX11-NEXT:    global_load_b128 v[28:31], v[28:29], off offset:112
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b128 v[2:3], v[8:11], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val = load <16 x bfloat>, ptr addrspace(1) %in
-  store <16 x bfloat> %val, ptr addrspace(1) %out
-  ret void
+  %load = load <64 x bfloat>, ptr addrspace(1) %ptr
+  ret <64 x bfloat> %load
 }
 
-define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_arg_store:
+define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s4, s6
 ; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_store_short v0, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_arg_store:
+; GFX7-LABEL: v_store_global_v2bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_store_short v0, v[1:2], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_arg_store:
+; GFX8-LABEL: v_store_global_v2bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    flat_store_short v[1:2], v0
+; GFX8-NEXT:    flat_store_dword v[1:2], v0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_arg_store:
+; GFX9-LABEL: v_store_global_v2bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_store_short_d16_hi v[1:2], v0, off
+; GFX9-NEXT:    global_store_dword v[1:2], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_arg_store:
+; GFX10-LABEL: v_store_global_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_store_short_d16_hi v[1:2], v0, off
+; GFX10-NEXT:    global_store_dword v[1:2], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_arg_store:
+; GFX11-LABEL: v_store_global_v2bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_store_d16_hi_b16 v[1:2], v0, off
+; GFX11-NEXT:    global_store_b32 v[1:2], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store bfloat %in, ptr addrspace(1) %out
+  store <2 x bfloat> %val, ptr addrspace(1) %ptr
   ret void
 }
 
-define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_arg_store_v2bf16:
+define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v3bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_arg_store_v2bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_arg_store_v2bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_store_dword v[1:2], v0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_arg_store_v2bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_store_dword v[1:2], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_arg_store_v2bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_store_dword v[1:2], v0, off
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: test_arg_store_v2bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_store_b32 v[1:2], v0, off
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store <2 x bfloat> %in, ptr addrspace(1) %out
-  ret void
-}
-
-define void @test_arg_store_v3bf16(<3 x bfloat> %in, <3 x bfloat> addrspace(1)* %out) {
-; GCN-LABEL: test_arg_store_v3bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
 ; GCN-NEXT:    s_mov_b32 s4, s6
@@ -777,7 +1004,7 @@ define void @test_arg_store_v3bf16(<3 x bfloat> %in, <3 x bfloat> addrspace(1)*
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_arg_store_v3bf16:
+; GFX7-LABEL: v_store_global_v3bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
@@ -792,7 +1019,7 @@ define void @test_arg_store_v3bf16(<3 x bfloat> %in, <3 x bfloat> addrspace(1)*
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_arg_store_v3bf16:
+; GFX8-LABEL: v_store_global_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_store_dword v[2:3], v0
@@ -802,7 +1029,7 @@ define void @test_arg_store_v3bf16(<3 x bfloat> %in, <3 x bfloat> addrspace(1)*
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_arg_store_v3bf16:
+; GFX9-LABEL: v_store_global_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_store_short v[2:3], v1, off offset:4
@@ -810,26 +1037,26 @@ define void @test_arg_store_v3bf16(<3 x bfloat> %in, <3 x bfloat> addrspace(1)*
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_arg_store_v3bf16:
+; GFX10-LABEL: v_store_global_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_store_short v[2:3], v1, off offset:4
 ; GFX10-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_arg_store_v3bf16:
+; GFX11-LABEL: v_store_global_v3bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b16 v[2:3], v1, off offset:4
 ; GFX11-NEXT:    global_store_b32 v[2:3], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store <3 x bfloat> %in, <3 x bfloat> addrspace(1) * %out
+  store <3 x bfloat> %val, ptr addrspace(1) %ptr
   ret void
 }
 
-define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_arg_store_v4bf16:
+define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
@@ -844,7 +1071,7 @@ define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_arg_store_v4bf16:
+; GFX7-LABEL: v_store_global_v4bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
@@ -859,37 +1086,37 @@ define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_arg_store_v4bf16:
+; GFX8-LABEL: v_store_global_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_arg_store_v4bf16:
+; GFX9-LABEL: v_store_global_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_arg_store_v4bf16:
+; GFX10-LABEL: v_store_global_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_arg_store_v4bf16:
+; GFX11-LABEL: v_store_global_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store <4 x bfloat> %in, ptr addrspace(1)  %out
+  store <4 x bfloat> %val, ptr addrspace(1) %ptr
   ret void
 }
 
-define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_arg_store_v8bf16:
+define void @v_store_global_v8bf16(<8 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v8bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
@@ -908,7 +1135,7 @@ define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_arg_store_v8bf16:
+; GFX7-LABEL: v_store_global_v8bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -927,37 +1154,37 @@ define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_arg_store_v8bf16:
+; GFX8-LABEL: v_store_global_v8bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_arg_store_v8bf16:
+; GFX9-LABEL: v_store_global_v8bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_arg_store_v8bf16:
+; GFX10-LABEL: v_store_global_v8bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_arg_store_v8bf16:
+; GFX11-LABEL: v_store_global_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store <8 x bfloat> %in, ptr addrspace(1) %out
+  store <8 x bfloat> %val, ptr addrspace(1) %ptr
   ret void
 }
 
-define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_arg_store_v16bf16:
+define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v16bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
@@ -985,7 +1212,7 @@ define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_arg_store_v16bf16:
+; GFX7-LABEL: v_store_global_v16bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
@@ -1013,7 +1240,7 @@ define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_arg_store_v16bf16:
+; GFX8-LABEL: v_store_global_v16bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
@@ -1024,7 +1251,7 @@ define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_arg_store_v16bf16:
+; GFX9-LABEL: v_store_global_v16bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off offset:16
@@ -1032,5969 +1259,9930 @@ define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_arg_store_v16bf16:
+; GFX10-LABEL: v_store_global_v16bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off offset:16
 ; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_arg_store_v16bf16:
+; GFX11-LABEL: v_store_global_v16bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v[8:9], v[4:7], off offset:16
 ; GFX11-NEXT:    global_store_b128 v[8:9], v[0:3], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store <16 x bfloat> %in, ptr addrspace(1) %out
+  store <16 x bfloat> %val, ptr addrspace(1) %ptr
   ret void
 }
 
-define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_inreg_arg_store:
+define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v32bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s34, s4, 16
-; GCN-NEXT:    s_mov_b32 s38, 0
-; GCN-NEXT:    s_mov_b32 s39, 0xf000
-; GCN-NEXT:    s_mov_b32 s36, s38
-; GCN-NEXT:    s_mov_b32 s37, s38
-; GCN-NEXT:    v_mov_b32_e32 v2, s34
-; GCN-NEXT:    buffer_store_short v2, v[0:1], s[36:39], 0 addr64
+; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT:    v_lshrrev_b32_e32 v31, 16, v5
+; GCN-NEXT:    v_alignbit_b32 v5, v7, v6, 16
+; GCN-NEXT:    v_alignbit_b32 v4, v31, v4, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_alignbit_b32 v3, v3, v2, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_alignbit_b32 v2, v1, v0, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v13
+; GCN-NEXT:    v_alignbit_b32 v13, v0, v14, 16
+; GCN-NEXT:    v_alignbit_b32 v12, v1, v12, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
+; GCN-NEXT:    v_alignbit_b32 v11, v0, v10, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v9
+; GCN-NEXT:    v_alignbit_b32 v10, v0, v8, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v23
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v21
+; GCN-NEXT:    v_alignbit_b32 v9, v0, v22, 16
+; GCN-NEXT:    v_alignbit_b32 v8, v1, v20, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v19
+; GCN-NEXT:    v_alignbit_b32 v7, v0, v18, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v17
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v29
+; GCN-NEXT:    v_lshrrev_b32_e32 v14, 16, v27
+; GCN-NEXT:    v_alignbit_b32 v6, v0, v16, 16
+; GCN-NEXT:    v_alignbit_b32 v16, v1, v28, 16
+; GCN-NEXT:    v_alignbit_b32 v15, v14, v26, 16
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v25
+; GCN-NEXT:    v_alignbit_b32 v14, v0, v24, 16
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s32
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32
+; GCN-NEXT:    buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT:    v_alignbit_b32 v17, v17, v30, 16
+; GCN-NEXT:    buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48
+; GCN-NEXT:    buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_inreg_arg_store:
+; GFX7-LABEL: v_store_global_v32bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s34, s4, 16
-; GFX7-NEXT:    s_mov_b32 s38, 0
-; GFX7-NEXT:    s_mov_b32 s39, 0xf000
-; GFX7-NEXT:    s_mov_b32 s36, s38
-; GFX7-NEXT:    s_mov_b32 s37, s38
-; GFX7-NEXT:    v_mov_b32_e32 v2, s34
-; GFX7-NEXT:    buffer_store_short v2, v[0:1], s[36:39], 0 addr64
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_alignbit_b32 v3, v3, v2, 16
+; GFX7-NEXT:    v_alignbit_b32 v2, v1, v0, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v13
+; GFX7-NEXT:    v_alignbit_b32 v13, v0, v14, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
+; GFX7-NEXT:    v_alignbit_b32 v11, v0, v10, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v9
+; GFX7-NEXT:    v_alignbit_b32 v10, v0, v8, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v23
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT:    v_alignbit_b32 v9, v0, v22, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v19
+; GFX7-NEXT:    v_lshrrev_b32_e32 v31, 16, v5
+; GFX7-NEXT:    v_alignbit_b32 v5, v7, v6, 16
+; GFX7-NEXT:    v_alignbit_b32 v12, v1, v12, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v21
+; GFX7-NEXT:    v_alignbit_b32 v7, v0, v18, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v17
+; GFX7-NEXT:    buffer_load_dword v17, off, s[0:3], s32
+; GFX7-NEXT:    v_alignbit_b32 v8, v1, v20, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v29
+; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v27
+; GFX7-NEXT:    v_alignbit_b32 v6, v0, v16, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v25
+; GFX7-NEXT:    v_alignbit_b32 v16, v1, v28, 16
+; GFX7-NEXT:    v_alignbit_b32 v15, v14, v26, 16
+; GFX7-NEXT:    v_alignbit_b32 v14, v0, v24, 16
+; GFX7-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
+; GFX7-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    v_alignbit_b32 v4, v31, v4, 16
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT:    buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX7-NEXT:    v_alignbit_b32 v17, v17, v30, 16
+; GFX7-NEXT:    buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT:    buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_inreg_arg_store:
+; GFX8-LABEL: v_store_global_v32bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_lshr_b32 s34, s4, 16
-; GFX8-NEXT:    v_mov_b32_e32 v2, s34
-; GFX8-NEXT:    flat_store_short v[0:1], v2
+; GFX8-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 48, v16
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v17, vcc
+; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[12:15]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 32, v16
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v17, vcc
+; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v16
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v17, vcc
+; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_inreg_arg_store:
+; GFX9-LABEL: v_store_global_v32bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    global_store_short_d16_hi v[0:1], v2, off
+; GFX9-NEXT:    global_store_dwordx4 v[16:17], v[12:15], off offset:48
+; GFX9-NEXT:    global_store_dwordx4 v[16:17], v[8:11], off offset:32
+; GFX9-NEXT:    global_store_dwordx4 v[16:17], v[4:7], off offset:16
+; GFX9-NEXT:    global_store_dwordx4 v[16:17], v[0:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_inreg_arg_store:
+; GFX10-LABEL: v_store_global_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-NEXT:    global_store_short_d16_hi v[0:1], v2, off
+; GFX10-NEXT:    global_store_dwordx4 v[16:17], v[12:15], off offset:48
+; GFX10-NEXT:    global_store_dwordx4 v[16:17], v[8:11], off offset:32
+; GFX10-NEXT:    global_store_dwordx4 v[16:17], v[4:7], off offset:16
+; GFX10-NEXT:    global_store_dwordx4 v[16:17], v[0:3], off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_inreg_arg_store:
+; GFX11-LABEL: v_store_global_v32bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v2, s4
-; GFX11-NEXT:    global_store_d16_hi_b16 v[0:1], v2, off
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:48
+; GFX11-NEXT:    global_store_b128 v[16:17], v[8:11], off offset:32
+; GFX11-NEXT:    global_store_b128 v[16:17], v[4:7], off offset:16
+; GFX11-NEXT:    global_store_b128 v[16:17], v[0:3], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store bfloat %in, ptr addrspace(1) %out
+  store <32 x bfloat> %val, ptr addrspace(1) %ptr
   ret void
 }
 
-define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) {
-; GCN-LABEL: test_byval:
+define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v64bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GCN-NEXT:    buffer_store_short v1, off, s[0:3], s32
+; GCN-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT:    v_alignbit_b32 v22, v23, v22, 16
+; GCN-NEXT:    v_alignbit_b32 v21, v21, v20, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT:    v_alignbit_b32 v20, v19, v18, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v17
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:136
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:132
+; GCN-NEXT:    v_alignbit_b32 v19, v19, v16, 16
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dwordx4 v[19:22], v[17:18], s[4:7], 0 addr64 offset:32
+; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT:    v_lshrrev_b32_e32 v16, 16, v13
+; GCN-NEXT:    v_alignbit_b32 v13, v15, v14, 16
+; GCN-NEXT:    v_alignbit_b32 v12, v16, v12, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT:    v_alignbit_b32 v11, v11, v10, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT:    v_alignbit_b32 v10, v9, v8, 16
+; GCN-NEXT:    buffer_store_dwordx4 v[10:13], v[17:18], s[4:7], 0 addr64 offset:16
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:128
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:120
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:124
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:116
+; GCN-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_alignbit_b32 v11, v8, v10, 16
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_alignbit_b32 v10, v9, v12, 16
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:112
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:108
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:104
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:100
+; GCN-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_alignbit_b32 v9, v8, v9, 16
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v12
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_alignbit_b32 v8, v8, v13, 16
+; GCN-NEXT:    buffer_store_dwordx4 v[8:11], v[17:18], s[4:7], 0 addr64 offset:112
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:96
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:88
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:92
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:84
+; GCN-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_alignbit_b32 v11, v8, v10, 16
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_alignbit_b32 v10, v9, v12, 16
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:80
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:76
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:72
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:68
+; GCN-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_alignbit_b32 v9, v8, v9, 16
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v12
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_alignbit_b32 v8, v8, v13, 16
+; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT:    v_lshrrev_b32_e32 v12, 16, v5
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v29
+; GCN-NEXT:    v_lshrrev_b32_e32 v14, 16, v27
+; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v25
+; GCN-NEXT:    buffer_store_dwordx4 v[8:11], v[17:18], s[4:7], 0 addr64 offset:96
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s32
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:32
+; GCN-NEXT:    v_alignbit_b32 v5, v7, v6, 16
+; GCN-NEXT:    v_alignbit_b32 v4, v12, v4, 16
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:28
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:24
+; GCN-NEXT:    v_alignbit_b32 v3, v3, v2, 16
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:20
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
+; GCN-NEXT:    v_alignbit_b32 v2, v1, v0, 16
+; GCN-NEXT:    v_alignbit_b32 v8, v13, v28, 16
+; GCN-NEXT:    v_alignbit_b32 v7, v14, v26, 16
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
+; GCN-NEXT:    v_alignbit_b32 v6, v15, v24, 16
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:4
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:64
+; GCN-NEXT:    s_waitcnt vmcnt(9)
+; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT:    s_waitcnt vmcnt(8)
+; GCN-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT:    s_waitcnt vmcnt(6)
+; GCN-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT:    v_alignbit_b32 v9, v9, v30, 16
+; GCN-NEXT:    v_alignbit_b32 v13, v10, v11, 16
+; GCN-NEXT:    s_waitcnt vmcnt(5)
+; GCN-NEXT:    v_alignbit_b32 v12, v12, v16, 16
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:60
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:56
+; GCN-NEXT:    s_waitcnt vmcnt(6)
+; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v19
+; GCN-NEXT:    s_waitcnt vmcnt(5)
+; GCN-NEXT:    v_alignbit_b32 v11, v11, v0, 16
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:52
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:48
+; GCN-NEXT:    s_waitcnt vmcnt(6)
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    s_waitcnt vmcnt(4)
+; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_lshrrev_b32_e32 v20, 16, v10
+; GCN-NEXT:    v_alignbit_b32 v10, v1, v14, 16
+; GCN-NEXT:    v_alignbit_b32 v22, v15, v16, 16
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_alignbit_b32 v21, v20, v0, 16
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:44
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:40
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:36
+; GCN-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v19
+; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_alignbit_b32 v20, v15, v0, 16
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_alignbit_b32 v19, v0, v14, 16
+; GCN-NEXT:    buffer_store_dwordx4 v[19:22], v[17:18], s[4:7], 0 addr64 offset:80
+; GCN-NEXT:    buffer_store_dwordx4 v[10:13], v[17:18], s[4:7], 0 addr64 offset:64
+; GCN-NEXT:    buffer_store_dwordx4 v[6:9], v[17:18], s[4:7], 0 addr64 offset:48
+; GCN-NEXT:    buffer_store_dwordx4 v[2:5], v[17:18], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_byval:
+; GFX7-LABEL: v_store_global_v64bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX7-NEXT:    buffer_store_short v1, off, s[0:3], s32
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT:    v_lshrrev_b32_e32 v31, 16, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_alignbit_b32 v5, v7, v6, 16
+; GFX7-NEXT:    v_alignbit_b32 v4, v31, v4, 16
+; GFX7-NEXT:    v_alignbit_b32 v3, v3, v2, 16
+; GFX7-NEXT:    v_alignbit_b32 v2, v1, v0, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
+; GFX7-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:128
+; GFX7-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:124
+; GFX7-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:120
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:112
+; GFX7-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:108
+; GFX7-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:104
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v13
+; GFX7-NEXT:    v_alignbit_b32 v13, v0, v14, 16
+; GFX7-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:100
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
+; GFX7-NEXT:    v_alignbit_b32 v12, v1, v12, 16
+; GFX7-NEXT:    v_alignbit_b32 v11, v0, v10, 16
+; GFX7-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:136
+; GFX7-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:132
+; GFX7-NEXT:    buffer_load_dword v35, off, s[0:3], s32
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
+; GFX7-NEXT:    v_alignbit_b32 v10, v9, v8, 16
+; GFX7-NEXT:    v_alignbit_b32 v8, v21, v20, 16
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    v_alignbit_b32 v9, v23, v22, 16
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(10)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT:    s_waitcnt vmcnt(9)
+; GFX7-NEXT:    v_alignbit_b32 v23, v6, v7, 16
+; GFX7-NEXT:    s_waitcnt vmcnt(8)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-NEXT:    v_alignbit_b32 v22, v15, v31, 16
+; GFX7-NEXT:    s_waitcnt vmcnt(6)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v20, 16, v32
+; GFX7-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-NEXT:    v_alignbit_b32 v21, v20, v33, 16
+; GFX7-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v32, 16, v34
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
+; GFX7-NEXT:    v_alignbit_b32 v7, v6, v18, 16
+; GFX7-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-NEXT:    v_alignbit_b32 v20, v32, v14, 16
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GFX7-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:24
+; GFX7-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:20
+; GFX7-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:16
+; GFX7-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:12
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v17
+; GFX7-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-NEXT:    buffer_store_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:112
+; GFX7-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:96
+; GFX7-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:92
+; GFX7-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:88
+; GFX7-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:84
+; GFX7-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:80
+; GFX7-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:76
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v29
+; GFX7-NEXT:    v_alignbit_b32 v6, v6, v16, 16
+; GFX7-NEXT:    v_alignbit_b32 v16, v15, v28, 16
+; GFX7-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:72
+; GFX7-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:68
+; GFX7-NEXT:    s_waitcnt vmcnt(14)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v35
+; GFX7-NEXT:    v_alignbit_b32 v17, v14, v30, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v27
+; GFX7-NEXT:    v_alignbit_b32 v15, v14, v26, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v25
+; GFX7-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:8
+; GFX7-NEXT:    v_alignbit_b32 v14, v14, v24, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v19, 16, v31
+; GFX7-NEXT:    s_waitcnt vmcnt(14)
+; GFX7-NEXT:    v_alignbit_b32 v21, v19, v32, 16
+; GFX7-NEXT:    s_waitcnt vmcnt(13)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v19, 16, v33
+; GFX7-NEXT:    s_waitcnt vmcnt(12)
+; GFX7-NEXT:    v_alignbit_b32 v20, v19, v34, 16
+; GFX7-NEXT:    s_waitcnt vmcnt(11)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v19, 16, v36
+; GFX7-NEXT:    s_waitcnt vmcnt(10)
+; GFX7-NEXT:    v_alignbit_b32 v19, v19, v37, 16
+; GFX7-NEXT:    s_waitcnt vmcnt(8)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
+; GFX7-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-NEXT:    v_alignbit_b32 v25, v22, v23, 16
+; GFX7-NEXT:    s_waitcnt vmcnt(6)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v24, 16, v38
+; GFX7-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-NEXT:    v_alignbit_b32 v24, v24, v18, 16
+; GFX7-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v18, 16, v39
+; GFX7-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-NEXT:    v_alignbit_b32 v23, v18, v48, 16
+; GFX7-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:4
+; GFX7-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:64
+; GFX7-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:60
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GFX7-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:48
+; GFX7-NEXT:    s_waitcnt vmcnt(8)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v22, 16, v28
+; GFX7-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-NEXT:    v_alignbit_b32 v22, v22, v29, 16
+; GFX7-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:44
+; GFX7-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:40
+; GFX7-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:36
+; GFX7-NEXT:    buffer_store_dwordx4 v[22:25], v[0:1], s[4:7], 0 addr64 offset:96
+; GFX7-NEXT:    s_waitcnt vmcnt(10)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v22, 16, v26
+; GFX7-NEXT:    s_waitcnt vmcnt(9)
+; GFX7-NEXT:    v_alignbit_b32 v18, v22, v18, 16
+; GFX7-NEXT:    s_waitcnt vmcnt(8)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v22, 16, v27
+; GFX7-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-NEXT:    v_alignbit_b32 v25, v22, v30, 16
+; GFX7-NEXT:    s_waitcnt vmcnt(6)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v23, 16, v31
+; GFX7-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-NEXT:    v_alignbit_b32 v24, v23, v32, 16
+; GFX7-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v22, 16, v33
+; GFX7-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-NEXT:    v_alignbit_b32 v23, v22, v28, 16
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v22, 16, v29
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_alignbit_b32 v22, v22, v34, 16
+; GFX7-NEXT:    buffer_store_dwordx4 v[22:25], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX7-NEXT:    buffer_store_dwordx4 v[18:21], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX7-NEXT:    buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT:    buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT:    buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT:    buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_byval:
+; GFX8-LABEL: v_store_global_v64bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    buffer_store_short v1, off, s[0:3], s32
+; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX8-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX8-NEXT:    s_movk_i32 s4, 0x70
+; GFX8-NEXT:    s_movk_i32 s5, 0x50
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_add_u32_e32 v34, vcc, s4, v32
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_addc_u32_e32 v35, vcc, 0, v33, vcc
+; GFX8-NEXT:    s_movk_i32 s4, 0x60
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dwordx4 v[34:35], v[28:31]
+; GFX8-NEXT:    flat_store_dwordx4 v[32:33], v[0:3]
+; GFX8-NEXT:    v_add_u32_e32 v28, vcc, s4, v32
+; GFX8-NEXT:    v_addc_u32_e32 v29, vcc, 0, v33, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s5, v32
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v33, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 64, v32
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v33, vcc
+; GFX8-NEXT:    flat_store_dwordx4 v[28:29], v[24:27]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_add_u32_e32 v24, vcc, 48, v32
+; GFX8-NEXT:    v_addc_u32_e32 v25, vcc, 0, v33, vcc
+; GFX8-NEXT:    v_add_u32_e32 v26, vcc, 32, v32
+; GFX8-NEXT:    v_addc_u32_e32 v27, vcc, 0, v33, vcc
+; GFX8-NEXT:    v_add_u32_e32 v28, vcc, 16, v32
+; GFX8-NEXT:    v_addc_u32_e32 v29, vcc, 0, v33, vcc
+; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[20:23]
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[16:19]
+; GFX8-NEXT:    flat_store_dwordx4 v[24:25], v[12:15]
+; GFX8-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
+; GFX8-NEXT:    flat_store_dwordx4 v[28:29], v[4:7]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_byval:
+; GFX9-LABEL: v_store_global_v64bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v0, off, s[0:3], s32
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[28:31], off offset:112
+; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[24:27], off offset:96
+; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[20:23], off offset:80
+; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[16:19], off offset:64
+; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[12:15], off offset:48
+; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[8:11], off offset:32
+; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[4:7], off offset:16
+; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[0:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_byval:
+; GFX10-LABEL: v_store_global_v64bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    buffer_store_short_d16_hi v0, off, s[0:3], s32
+; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[28:31], off offset:112
+; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[24:27], off offset:96
+; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[20:23], off offset:80
+; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[16:19], off offset:64
+; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[12:15], off offset:48
+; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[8:11], off offset:32
+; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[4:7], off offset:16
+; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[0:3], off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_byval:
+; GFX11-LABEL: v_store_global_v64bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_store_d16_hi_b16 off, v0, s32
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_clause 0x7
+; GFX11-NEXT:    global_store_b128 v[32:33], v[28:31], off offset:112
+; GFX11-NEXT:    global_store_b128 v[32:33], v[24:27], off offset:96
+; GFX11-NEXT:    global_store_b128 v[32:33], v[20:23], off offset:80
+; GFX11-NEXT:    global_store_b128 v[32:33], v[16:19], off offset:64
+; GFX11-NEXT:    global_store_b128 v[32:33], v[12:15], off offset:48
+; GFX11-NEXT:    global_store_b128 v[32:33], v[8:11], off offset:32
+; GFX11-NEXT:    global_store_b128 v[32:33], v[4:7], off offset:16
+; GFX11-NEXT:    global_store_b128 v[32:33], v[0:3], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store bfloat %val, ptr addrspace(5) %bv
-  %retval = load bfloat, ptr addrspace(5) %bv
-  ret bfloat %retval
+  store <64 x bfloat> %val, ptr addrspace(1) %ptr
+  ret void
 }
 
-define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) {
-; GCN-LABEL: test_sret:
+define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) {
+; GCN-LABEL: test_store_fpimm:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x3f80
+; GCN-NEXT:    v_mov_b32_e32 v5, 0x4228
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_store_short v4, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_store_short v5, v[2:3], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_sret:
+; GFX7-LABEL: test_store_fpimm:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    v_mov_b32_e32 v4, 0x3f80
+; GFX7-NEXT:    buffer_store_short v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0x4228
+; GFX7-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_sret:
+; GFX8-LABEL: test_store_fpimm:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x3f80
+; GFX8-NEXT:    flat_store_short v[0:1], v4
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0x4228
+; GFX8-NEXT:    flat_store_short v[2:3], v0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_sret:
+; GFX9-LABEL: test_store_fpimm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3f80
+; GFX9-NEXT:    global_store_short v[0:1], v4, off
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4228
+; GFX9-NEXT:    global_store_short v[2:3], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_sret:
+; GFX10-LABEL: test_store_fpimm:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0x3f80
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0x4228
+; GFX10-NEXT:    global_store_short v[0:1], v4, off
+; GFX10-NEXT:    global_store_short v[2:3], v5, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_sret:
+; GFX11-LABEL: test_store_fpimm:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v0, v1, off
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0x3f80
+; GFX11-NEXT:    v_mov_b32_e32 v5, 0x4228
+; GFX11-NEXT:    global_store_b16 v[0:1], v4, off
+; GFX11-NEXT:    global_store_b16 v[2:3], v5, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store bfloat %val, ptr addrspace(5) %sret
+  store bfloat 1.0, ptr addrspace(1) %ptr0
+  store bfloat 42.0, ptr addrspace(1) %ptr1
   ret void
 }
 
-define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_bitcast_from_bfloat:
+define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_load_store_f32_to_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s4, s6
 ; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GCN-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_bitcast_from_bfloat:
+; GFX7-LABEL: test_load_store_f32_to_bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_bitcast_from_bfloat:
+; GFX8-LABEL: test_load_store_f32_to_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    flat_store_short v[2:3], v0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_bitcast_from_bfloat:
+; GFX9-LABEL: test_load_store_f32_to_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_short v[2:3], v0, off
+; GFX9-NEXT:    global_store_short_d16_hi v[2:3], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_bitcast_from_bfloat:
+; GFX10-LABEL: test_load_store_f32_to_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_short v[2:3], v0, off
+; GFX10-NEXT:    global_store_short_d16_hi v[2:3], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_bitcast_from_bfloat:
+; GFX11-LABEL: test_load_store_f32_to_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-NEXT:    global_store_d16_hi_b16 v[2:3], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val = load bfloat, ptr addrspace(1) %in
-  %val_int = bitcast bfloat %val to i16
-  store i16 %val_int, ptr addrspace(1) %out
+  %val = load float, ptr addrspace(1) %in
+  %val.bf16 = fptrunc float %val to bfloat
+  store bfloat %val.bf16, ptr addrspace(1) %out
   ret void
 }
 
-define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in) {
-; GCN-LABEL: test_bitcast_to_bfloat:
+define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_load_store_f64_to_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s4, s6
 ; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_bitcast_to_bfloat:
+; GFX7-LABEL: test_load_store_f64_to_bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_store_short v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_bitcast_to_bfloat:
+; GFX8-LABEL: test_load_store_f64_to_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_ushort v2, v[2:3]
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    flat_store_short v[0:1], v2
+; GFX8-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    flat_store_short v[2:3], v0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_bitcast_to_bfloat:
+; GFX9-LABEL: test_load_store_f64_to_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v2, v[2:3], off
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX9-NEXT:    global_store_short_d16_hi v[2:3], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_bitcast_to_bfloat:
+; GFX10-LABEL: test_load_store_f64_to_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v2, v[2:3], off
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_short v[0:1], v2, off
+; GFX10-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX10-NEXT:    global_store_short_d16_hi v[2:3], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_bitcast_to_bfloat:
+; GFX11-LABEL: test_load_store_f64_to_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v2, v[2:3], off
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX11-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX11-NEXT:    global_store_d16_hi_b16 v[2:3], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %val = load i16, ptr addrspace(1) %in
-  %val_fp = bitcast i16 %val to bfloat
-  store bfloat %val_fp, ptr addrspace(1) %out
+  %val = load double, ptr addrspace(1) %in
+  %val.bf16 = fptrunc double %val to bfloat
+  store bfloat %val.bf16, ptr addrspace(1) %out
   ret void
 }
 
-define bfloat @test_ret(bfloat %in) {
-; GCN-LABEL: test_ret:
-; GCN:       ; %bb.0: ; %entry
+define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_load_store_bf16_to_f32:
+; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_ret:
-; GFX7:       ; %bb.0: ; %entry
+; GFX7-LABEL: test_load_store_bf16_to_f32:
+; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_ret:
-; GFX8:       ; %bb.0: ; %entry
+; GFX8-LABEL: test_load_store_bf16_to_f32:
+; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    flat_store_dword v[2:3], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_ret:
-; GFX9:       ; %bb.0: ; %entry
+; GFX9-LABEL: test_load_store_bf16_to_f32:
+; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    global_load_short_d16_hi v4, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dword v[2:3], v4, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_ret:
-; GFX10:       ; %bb.0: ; %entry
+; GFX10-LABEL: test_load_store_bf16_to_f32:
+; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    global_load_short_d16_hi v4, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dword v[2:3], v4, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_ret:
-; GFX11:       ; %bb.0: ; %entry
+; GFX11-LABEL: test_load_store_bf16_to_f32:
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    global_load_d16_hi_b16 v4, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b32 v[2:3], v4, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  ret bfloat %in
+  %val = load bfloat, ptr addrspace(1) %in
+  %val.f32 = fpext bfloat %val to float
+  store float %val.f32, ptr addrspace(1) %out
+  ret void
 }
 
-define <2 x bfloat> @test_ret_v2bf16(<2 x bfloat> %in) {
-; GCN-LABEL: test_ret_v2bf16:
-; GCN:       ; %bb.0: ; %entry
+define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_load_store_bf16_to_f64:
+; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_ret_v2bf16:
-; GFX7:       ; %bb.0: ; %entry
+; GFX7-LABEL: test_load_store_bf16_to_f64:
+; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_ret_v2bf16:
-; GFX8:       ; %bb.0: ; %entry
+; GFX8-LABEL: test_load_store_bf16_to_f64:
+; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_ret_v2bf16:
-; GFX9:       ; %bb.0: ; %entry
+; GFX9-LABEL: test_load_store_bf16_to_f64:
+; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    global_load_short_d16_hi v4, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_ret_v2bf16:
-; GFX10:       ; %bb.0: ; %entry
+; GFX10-LABEL: test_load_store_bf16_to_f64:
+; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    global_load_short_d16_hi v4, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
+; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_ret_v2bf16:
-; GFX11:       ; %bb.0: ; %entry
+; GFX11-LABEL: test_load_store_bf16_to_f64:
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  ret <2 x bfloat> %in
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    global_load_d16_hi_b16 v4, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
+; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = load bfloat, ptr addrspace(1) %in
+  %val.f64 = fpext bfloat %val to double
+  store double %val.f64, ptr addrspace(1) %out
+  ret void
 }
 
-define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) {
-; GCN-LABEL: test_ret_v3bf16:
-; GCN:       ; %bb.0: ; %entry
+define void @test_load_store_v2bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_load_store_v2bf16:
+; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_ret_v3bf16:
-; GFX7:       ; %bb.0: ; %entry
+; GFX7-LABEL: test_load_store_v2bf16:
+; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_ret_v3bf16:
-; GFX8:       ; %bb.0: ; %entry
+; GFX8-LABEL: test_load_store_v2bf16:
+; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dword v[2:3], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_ret_v3bf16:
-; GFX9:       ; %bb.0: ; %entry
+; GFX9-LABEL: test_load_store_v2bf16:
+; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_ret_v3bf16:
-; GFX10:       ; %bb.0: ; %entry
+; GFX10-LABEL: test_load_store_v2bf16:
+; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_ret_v3bf16:
-; GFX11:       ; %bb.0: ; %entry
+; GFX11-LABEL: test_load_store_v2bf16:
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b32 v[2:3], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  ret <3 x bfloat> %in
+  %val = load <2 x bfloat>, ptr addrspace(1) %in
+  store <2 x bfloat> %val, ptr addrspace(1) %out
+  ret void
 }
 
-define <4 x bfloat> @test_ret_v4bf16(<4 x bfloat> %in) {
-; GCN-LABEL: test_ret_v4bf16:
-; GCN:       ; %bb.0: ; %entry
+define void @test_load_store_v4bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_load_store_v4bf16:
+; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_ret_v4bf16:
-; GFX7:       ; %bb.0: ; %entry
+; GFX7-LABEL: test_load_store_v4bf16:
+; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_ret_v4bf16:
-; GFX8:       ; %bb.0: ; %entry
+; GFX8-LABEL: test_load_store_v4bf16:
+; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_ret_v4bf16:
-; GFX9:       ; %bb.0: ; %entry
+; GFX9-LABEL: test_load_store_v4bf16:
+; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_ret_v4bf16:
-; GFX10:       ; %bb.0: ; %entry
+; GFX10-LABEL: test_load_store_v4bf16:
+; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_ret_v4bf16:
-; GFX11:       ; %bb.0: ; %entry
+; GFX11-LABEL: test_load_store_v4bf16:
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  ret <4 x bfloat> %in
+  %val = load <4 x bfloat>, ptr addrspace(1) %in
+  store <4 x bfloat> %val, ptr addrspace(1) %out
+  ret void
 }
 
-define <8 x bfloat> @test_ret_v8bf16(<8 x bfloat> %in) {
-; GCN-LABEL: test_ret_v8bf16:
-; GCN:       ; %bb.0: ; %entry
+define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_load_store_v8bf16:
+; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_ret_v8bf16:
-; GFX7:       ; %bb.0: ; %entry
+; GFX7-LABEL: test_load_store_v8bf16:
+; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_ret_v8bf16:
-; GFX8:       ; %bb.0: ; %entry
+; GFX8-LABEL: test_load_store_v8bf16:
+; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_ret_v8bf16:
-; GFX9:       ; %bb.0: ; %entry
+; GFX9-LABEL: test_load_store_v8bf16:
+; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_ret_v8bf16:
-; GFX10:       ; %bb.0: ; %entry
+; GFX10-LABEL: test_load_store_v8bf16:
+; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_ret_v8bf16:
-; GFX11:       ; %bb.0: ; %entry
+; GFX11-LABEL: test_load_store_v8bf16:
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b128 v[2:3], v[4:7], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  ret <8 x bfloat> %in
+  %val = load <8 x bfloat>, ptr addrspace(1) %in
+  store <8 x bfloat> %val, ptr addrspace(1) %out
+  ret void
 }
 
-define <16 x bfloat> @test_ret_v16bf16(<16 x bfloat> %in) {
-; GCN-LABEL: test_ret_v16bf16:
-; GCN:       ; %bb.0: ; %entry
+define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_load_store_v16bf16:
+; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_ret_v16bf16:
-; GFX7:       ; %bb.0: ; %entry
+; GFX7-LABEL: test_load_store_v16bf16:
+; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_ret_v16bf16:
-; GFX8:       ; %bb.0: ; %entry
+; GFX8-LABEL: test_load_store_v16bf16:
+; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GFX8-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[8:11]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_ret_v16bf16:
-; GFX9:       ; %bb.0: ; %entry
+; GFX9-LABEL: test_load_store_v16bf16:
+; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_ret_v16bf16:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_ret_v16bf16:
-; GFX11:       ; %bb.0: ; %entry
+; GFX10-LABEL: test_load_store_v16bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:16
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_load_store_v16bf16:
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off offset:16
+; GFX11-NEXT:    global_load_b128 v[8:11], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    global_store_b128 v[2:3], v[4:7], off offset:16
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b128 v[2:3], v[8:11], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  ret <16 x bfloat> %in
+  %val = load <16 x bfloat>, ptr addrspace(1) %in
+  store <16 x bfloat> %val, ptr addrspace(1) %out
+  ret void
 }
 
-define void @test_call(bfloat %in, ptr addrspace(5) %out) {
-; GCN-LABEL: test_call:
-; GCN:       ; %bb.0: ; %entry
+define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_arg_store:
+; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s33
-; GCN-NEXT:    s_mov_b32 s33, s32
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_writelane_b32 v2, s30, 0
-; GCN-NEXT:    v_writelane_b32 v2, s31, 1
-; GCN-NEXT:    s_getpc_b64 s[4:5]
-; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store at gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store at gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v2, 1
-; GCN-NEXT:    v_readlane_b32 s30, v2, 0
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    s_mov_b32 s33, s8
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_store_short v0, v[1:2], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_call:
-; GFX7:       ; %bb.0: ; %entry
+; GFX7-LABEL: test_arg_store:
+; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s8, s33
-; GFX7-NEXT:    s_mov_b32 s33, s32
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_addk_i32 s32, 0x400
-; GFX7-NEXT:    s_getpc_b64 s[4:5]
-; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store at gotpcrel32@lo+4
-; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store at gotpcrel32@hi+12
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX7-NEXT:    v_writelane_b32 v2, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v2, s31, 1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v2, 0
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX7-NEXT:    s_mov_b32 s33, s8
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_store_short v0, v[1:2], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_call:
-; GFX8:       ; %bb.0: ; %entry
+; GFX8-LABEL: test_arg_store:
+; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s6, s33
-; GFX8-NEXT:    s_mov_b32 s33, s32
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_addk_i32 s32, 0x400
-; GFX8-NEXT:    s_getpc_b64 s[4:5]
-; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store at gotpcrel32@lo+4
-; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store at gotpcrel32@hi+12
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX8-NEXT:    v_writelane_b32 v2, s30, 0
-; GFX8-NEXT:    v_writelane_b32 v2, s31, 1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v2, 0
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX8-NEXT:    s_mov_b32 s33, s6
+; GFX8-NEXT:    flat_store_short v[1:2], v0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_call:
-; GFX9:       ; %bb.0: ; %entry
+; GFX9-LABEL: test_arg_store:
+; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s6, s33
-; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, test_arg_store at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    buffer_store_short_d16_hi v0, v1, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    global_store_short_d16_hi v[1:2], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_call:
-; GFX10:       ; %bb.0: ; %entry
+; GFX10-LABEL: test_arg_store:
+; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s6, s33
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, test_arg_store at gotpcrel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store at gotpcrel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v2, s30, 0
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_writelane_b32 v2, s31, 1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    buffer_store_short_d16_hi v0, v1, s[0:3], 0 offen
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    s_mov_b32 s33, s6
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_short_d16_hi v[1:2], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_call:
-; GFX11:       ; %bb.0: ; %entry
+; GFX11-LABEL: test_arg_store:
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s2, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, test_arg_store at gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, test_arg_store at gotpcrel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v1, v0, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_add_i32 s32, s32, -16
-; GFX11-NEXT:    s_mov_b32 s33, s2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_d16_hi_b16 v[1:2], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %result = call bfloat @test_arg_store(bfloat %in)
-  store volatile bfloat %result, ptr addrspace(5) %out
+  store bfloat %in, ptr addrspace(1) %out
   ret void
 }
 
-define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
-; GCN-LABEL: test_call_v2bf16:
-; GCN:       ; %bb.0: ; %entry
+define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_arg_store_v2bf16:
+; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s33
-; GCN-NEXT:    s_mov_b32 s33, s32
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_writelane_b32 v3, s30, 0
-; GCN-NEXT:    v_writelane_b32 v3, s31, 1
-; GCN-NEXT:    s_getpc_b64 s[4:5]
-; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, 2, v2
-; GCN-NEXT:    buffer_store_short v1, v4, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v3, 1
-; GCN-NEXT:    v_readlane_b32 s30, v3, 0
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    s_mov_b32 s33, s8
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_call_v2bf16:
-; GFX7:       ; %bb.0: ; %entry
+; GFX7-LABEL: test_arg_store_v2bf16:
+; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s8, s33
-; GFX7-NEXT:    s_mov_b32 s33, s32
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_addk_i32 s32, 0x400
-; GFX7-NEXT:    s_getpc_b64 s[4:5]
-; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX7-NEXT:    v_writelane_b32 v3, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v3, s31, 1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 2, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    buffer_store_short v1, v4, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v3, 0
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX7-NEXT:    s_mov_b32 s33, s8
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_call_v2bf16:
-; GFX8:       ; %bb.0: ; %entry
+; GFX8-LABEL: test_arg_store_v2bf16:
+; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s6, s33
-; GFX8-NEXT:    s_mov_b32 s33, s32
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_addk_i32 s32, 0x400
-; GFX8-NEXT:    s_getpc_b64 s[4:5]
-; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX8-NEXT:    v_writelane_b32 v2, s30, 0
-; GFX8-NEXT:    v_writelane_b32 v2, s31, 1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v2, 0
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX8-NEXT:    s_mov_b32 s33, s6
+; GFX8-NEXT:    flat_store_dword v[1:2], v0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_call_v2bf16:
-; GFX9:       ; %bb.0: ; %entry
+; GFX9-LABEL: test_arg_store_v2bf16:
+; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s6, s33
-; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    global_store_dword v[1:2], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_call_v2bf16:
-; GFX10:       ; %bb.0: ; %entry
+; GFX10-LABEL: test_arg_store_v2bf16:
+; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s6, s33
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v2, s30, 0
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_writelane_b32 v2, s31, 1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    s_mov_b32 s33, s6
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dword v[1:2], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_call_v2bf16:
-; GFX11:       ; %bb.0: ; %entry
+; GFX11-LABEL: test_arg_store_v2bf16:
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s2, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    scratch_store_b32 v1, v0, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_add_i32 s32, s32, -16
-; GFX11-NEXT:    s_mov_b32 s33, s2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b32 v[1:2], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %result = call <2 x bfloat> @test_arg_store_v2bf16(<2 x bfloat> %in)
-  store volatile <2 x bfloat> %result, ptr addrspace(5) %out
+  store <2 x bfloat> %in, ptr addrspace(1) %out
   ret void
 }
 
-define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
-; GCN-LABEL: test_call_v3bf16:
-; GCN:       ; %bb.0: ; %entry
+define void @test_arg_store_v3bf16(<3 x bfloat> %in, <3 x bfloat> addrspace(1)* %out) {
+; GCN-LABEL: test_arg_store_v3bf16:
+; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s33
-; GCN-NEXT:    s_mov_b32 s33, s32
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_writelane_b32 v4, s30, 0
-; GCN-NEXT:    v_writelane_b32 v4, s31, 1
-; GCN-NEXT:    s_getpc_b64 s[4:5]
-; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 4, v3
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
-; GCN-NEXT:    buffer_store_short v2, v5, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v0, v3, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v4, 1
-; GCN-NEXT:    v_readlane_b32 s30, v4, 0
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    s_mov_b32 s33, s8
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
+; GCN-NEXT:    buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_call_v3bf16:
-; GFX7:       ; %bb.0: ; %entry
+; GFX7-LABEL: test_arg_store_v3bf16:
+; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s8, s33
-; GFX7-NEXT:    s_mov_b32 s33, s32
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_addk_i32 s32, 0x400
-; GFX7-NEXT:    s_getpc_b64 s[4:5]
-; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX7-NEXT:    v_writelane_b32 v4, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v4, s31, 1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 4, v3
-; GFX7-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_store_dword v0, v3, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v4, 0
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX7-NEXT:    s_mov_b32 s33, s8
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:4
+; GFX7-NEXT:    buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_call_v3bf16:
-; GFX8:       ; %bb.0: ; %entry
+; GFX8-LABEL: test_arg_store_v3bf16:
+; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s6, s33
-; GFX8-NEXT:    s_mov_b32 s33, s32
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_addk_i32 s32, 0x400
-; GFX8-NEXT:    s_getpc_b64 s[4:5]
-; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX8-NEXT:    v_writelane_b32 v3, s30, 0
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX8-NEXT:    v_writelane_b32 v3, s31, 1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 4, v2
-; GFX8-NEXT:    buffer_store_short v1, v4, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v3, 0
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX8-NEXT:    s_mov_b32 s33, s6
+; GFX8-NEXT:    flat_store_dword v[2:3], v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v2
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT:    flat_store_short v[2:3], v1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_call_v3bf16:
-; GFX9:       ; %bb.0: ; %entry
+; GFX9-LABEL: test_arg_store_v3bf16:
+; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s6, s33
-; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v3, s30, 0
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT:    v_writelane_b32 v3, s31, 1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v3, 0
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    global_store_short v[2:3], v1, off offset:4
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_call_v3bf16:
-; GFX10:       ; %bb.0: ; %entry
+; GFX10-LABEL: test_arg_store_v3bf16:
+; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s6, s33
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v3, s30, 0
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT:    v_writelane_b32 v3, s31, 1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen offset:4
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v3, 0
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    s_mov_b32 s33, s6
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_short v[2:3], v1, off offset:4
+; GFX10-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_call_v3bf16:
-; GFX11:       ; %bb.0: ; %entry
+; GFX11-LABEL: test_arg_store_v3bf16:
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s2, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_store_b32 off, v3, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v3, s30, 0
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_writelane_b32 v3, s31, 1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    scratch_store_b16 v2, v1, off offset:4 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b32 v2, v0, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v3, 0
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_add_i32 s32, s32, -16
-; GFX11-NEXT:    s_mov_b32 s33, s2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b16 v[2:3], v1, off offset:4
+; GFX11-NEXT:    global_store_b32 v[2:3], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %result = call <3 x bfloat> @test_arg_store_v2bf16(<3 x bfloat> %in)
-  store volatile <3 x bfloat> %result, ptr addrspace(5) %out
+  store <3 x bfloat> %in, <3 x bfloat> addrspace(1) * %out
   ret void
 }
 
-define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
-; GCN-LABEL: test_call_v4bf16:
-; GCN:       ; %bb.0: ; %entry
+define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_arg_store_v4bf16:
+; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s33
-; GCN-NEXT:    s_mov_b32 s33, s32
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_writelane_b32 v5, s30, 0
-; GCN-NEXT:    v_writelane_b32 v5, s31, 1
-; GCN-NEXT:    s_getpc_b64 s[4:5]
-; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, 6, v4
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, 4, v4
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, 2, v4
-; GCN-NEXT:    buffer_store_short v3, v6, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v2, v7, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v1, v8, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v0, v4, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v5, 1
-; GCN-NEXT:    v_readlane_b32 s30, v5, 0
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    s_mov_b32 s33, s8
+; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_alignbit_b32 v1, v3, v2, 16
+; GCN-NEXT:    v_alignbit_b32 v0, v6, v0, 16
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_call_v4bf16:
-; GFX7:       ; %bb.0: ; %entry
+; GFX7-LABEL: test_arg_store_v4bf16:
+; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s8, s33
-; GFX7-NEXT:    s_mov_b32 s33, s32
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_addk_i32 s32, 0x400
-; GFX7-NEXT:    s_getpc_b64 s[4:5]
-; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX7-NEXT:    v_writelane_b32 v5, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v5, s31, 1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_add_i32_e32 v6, vcc, 6, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    buffer_store_short v3, v6, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 4, v4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    buffer_store_short v2, v3, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 2, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_store_short v0, v4, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v5, 0
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX7-NEXT:    s_mov_b32 s33, s8
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    v_alignbit_b32 v2, v3, v2, 16
+; GFX7-NEXT:    v_alignbit_b32 v1, v1, v0, 16
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_store_dwordx2 v[1:2], v[4:5], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_call_v4bf16:
-; GFX8:       ; %bb.0: ; %entry
+; GFX8-LABEL: test_arg_store_v4bf16:
+; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s6, s33
-; GFX8-NEXT:    s_mov_b32 s33, s32
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_addk_i32 s32, 0x400
-; GFX8-NEXT:    s_getpc_b64 s[4:5]
-; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX8-NEXT:    v_writelane_b32 v3, s30, 0
-; GFX8-NEXT:    v_writelane_b32 v3, s31, 1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 4, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    buffer_store_short v1, v6, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 6, v2
-; GFX8-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 2, v2
-; GFX8-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v3, 0
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX8-NEXT:    s_mov_b32 s33, s6
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_call_v4bf16:
-; GFX9:       ; %bb.0: ; %entry
+; GFX9-LABEL: test_arg_store_v4bf16:
+; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s6, s33
-; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v3, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v3, s31, 1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    buffer_store_short_d16_hi v1, v2, s[0:3], 0 offen offset:6
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v0, v2, s[0:3], 0 offen offset:2
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v3, 0
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_call_v4bf16:
-; GFX10:       ; %bb.0: ; %entry
+; GFX10-LABEL: test_arg_store_v4bf16:
+; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s6, s33
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v3, s30, 0
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_writelane_b32 v3, s31, 1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    buffer_store_short_d16_hi v1, v2, s[0:3], 0 offen offset:6
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen offset:4
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short_d16_hi v0, v2, s[0:3], 0 offen offset:2
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v3, 0
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    s_mov_b32 s33, s6
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_call_v4bf16:
-; GFX11:       ; %bb.0: ; %entry
+; GFX11-LABEL: test_arg_store_v4bf16:
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s2, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_store_b32 off, v3, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v3, s30, 0
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    v_writelane_b32 v3, s31, 1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_add_nc_u32_e32 v4, 6, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v3, 0
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v4, v1, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b16 v2, v1, off offset:4 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v2, v0, off offset:2 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b16 v2, v0, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_add_i32 s32, s32, -16
-; GFX11-NEXT:    s_mov_b32 s33, s2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %result = call <4 x bfloat> @test_arg_store_v2bf16(<4 x bfloat> %in)
-  store volatile <4 x bfloat> %result, ptr addrspace(5) %out
+  store <4 x bfloat> %in, ptr addrspace(1)  %out
   ret void
 }
 
-define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
-; GCN-LABEL: test_call_v8bf16:
-; GCN:       ; %bb.0: ; %entry
+define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_arg_store_v8bf16:
+; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s33
-; GCN-NEXT:    s_mov_b32 s33, s32
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_writelane_b32 v9, s30, 0
-; GCN-NEXT:    v_writelane_b32 v9, s31, 1
-; GCN-NEXT:    s_getpc_b64 s[4:5]
-; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, 14, v8
-; GCN-NEXT:    v_add_i32_e32 v11, vcc, 12, v8
-; GCN-NEXT:    v_add_i32_e32 v12, vcc, 10, v8
-; GCN-NEXT:    v_add_i32_e32 v13, vcc, 8, v8
-; GCN-NEXT:    v_add_i32_e32 v14, vcc, 6, v8
-; GCN-NEXT:    v_add_i32_e32 v15, vcc, 4, v8
-; GCN-NEXT:    v_add_i32_e32 v16, vcc, 2, v8
-; GCN-NEXT:    buffer_store_short v7, v10, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v6, v11, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v5, v12, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v4, v13, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v3, v14, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v2, v15, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v1, v16, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v0, v8, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v9, 1
-; GCN-NEXT:    v_readlane_b32 s30, v9, 0
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    s_mov_b32 s33, s8
+; GCN-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    v_alignbit_b32 v5, v7, v6, 16
+; GCN-NEXT:    v_alignbit_b32 v4, v10, v4, 16
+; GCN-NEXT:    v_alignbit_b32 v3, v3, v2, 16
+; GCN-NEXT:    v_alignbit_b32 v2, v1, v0, 16
+; GCN-NEXT:    buffer_store_dwordx4 v[2:5], v[8:9], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_call_v8bf16:
-; GFX7:       ; %bb.0: ; %entry
+; GFX7-LABEL: test_arg_store_v8bf16:
+; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s8, s33
-; GFX7-NEXT:    s_mov_b32 s33, s32
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_addk_i32 s32, 0x400
-; GFX7-NEXT:    s_getpc_b64 s[4:5]
-; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX7-NEXT:    v_writelane_b32 v9, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v9, s31, 1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX7-NEXT:    v_add_i32_e32 v10, vcc, 14, v8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT:    buffer_store_short v7, v10, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 12, v8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT:    buffer_store_short v6, v7, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v6, vcc, 10, v8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    buffer_store_short v5, v6, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 8, v8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    buffer_store_short v4, v5, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 6, v8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    buffer_store_short v3, v4, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 4, v8
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    buffer_store_short v2, v3, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 2, v8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_store_short v0, v8, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v9, 0
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX7-NEXT:    s_mov_b32 s33, s8
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    v_alignbit_b32 v6, v7, v6, 16
+; GFX7-NEXT:    v_alignbit_b32 v5, v5, v4, 16
+; GFX7-NEXT:    v_alignbit_b32 v4, v3, v2, 16
+; GFX7-NEXT:    v_alignbit_b32 v3, v1, v0, 16
+; GFX7-NEXT:    buffer_store_dwordx4 v[3:6], v[8:9], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_call_v8bf16:
-; GFX8:       ; %bb.0: ; %entry
+; GFX8-LABEL: test_arg_store_v8bf16:
+; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s6, s33
-; GFX8-NEXT:    s_mov_b32 s33, s32
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_addk_i32 s32, 0x400
-; GFX8-NEXT:    s_getpc_b64 s[4:5]
-; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX8-NEXT:    v_writelane_b32 v5, s30, 0
-; GFX8-NEXT:    v_writelane_b32 v5, s31, 1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 12, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX8-NEXT:    buffer_store_short v3, v10, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 8, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX8-NEXT:    buffer_store_short v2, v3, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX8-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    buffer_store_short v0, v4, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 14, v4
-; GFX8-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 10, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX8-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 6, v4
-; GFX8-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 2, v4
-; GFX8-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v5, 0
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX8-NEXT:    s_mov_b32 s33, s6
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_call_v8bf16:
-; GFX9:       ; %bb.0: ; %entry
+; GFX9-LABEL: test_arg_store_v8bf16:
+; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s6, s33
-; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v5, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v5, s31, 1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    buffer_store_short_d16_hi v3, v4, s[0:3], 0 offen offset:14
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v3, v4, s[0:3], 0 offen offset:12
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v2, v4, s[0:3], 0 offen offset:10
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v2, v4, s[0:3], 0 offen offset:8
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v1, v4, s[0:3], 0 offen offset:6
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v1, v4, s[0:3], 0 offen offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v0, v4, s[0:3], 0 offen offset:2
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v0, v4, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v5, 0
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_call_v8bf16:
-; GFX10:       ; %bb.0: ; %entry
+; GFX10-LABEL: test_arg_store_v8bf16:
+; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s6, s33
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v5, s30, 0
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_writelane_b32 v5, s31, 1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    buffer_store_short_d16_hi v3, v4, s[0:3], 0 offen offset:14
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v3, v4, s[0:3], 0 offen offset:12
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short_d16_hi v2, v4, s[0:3], 0 offen offset:10
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v2, v4, s[0:3], 0 offen offset:8
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short_d16_hi v1, v4, s[0:3], 0 offen offset:6
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v1, v4, s[0:3], 0 offen offset:4
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short_d16_hi v0, v4, s[0:3], 0 offen offset:2
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v0, v4, s[0:3], 0 offen
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v5, 0
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    s_mov_b32 s33, s6
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_call_v8bf16:
-; GFX11:       ; %bb.0: ; %entry
+; GFX11-LABEL: test_arg_store_v8bf16:
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s2, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_store_b32 off, v5, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v5, s30, 0
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    v_writelane_b32 v5, s31, 1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_add_nc_u32_e32 v6, 14, v4
-; GFX11-NEXT:    v_add_nc_u32_e32 v7, 12, v4
-; GFX11-NEXT:    v_add_nc_u32_e32 v8, 10, v4
-; GFX11-NEXT:    v_add_nc_u32_e32 v9, 6, v4
-; GFX11-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v6, v3, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b16 v7, v3, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v8, v2, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b16 v4, v2, off offset:8 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v9, v1, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b16 v4, v1, off offset:4 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v4, v0, off offset:2 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b16 v4, v0, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_readlane_b32 s30, v5, 0
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_add_i32 s32, s32, -16
-; GFX11-NEXT:    s_mov_b32 s33, s2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %result = call <8 x bfloat> @test_arg_store_v2bf16(<8 x bfloat> %in)
-  store volatile <8 x bfloat> %result, ptr addrspace(5) %out
+  store <8 x bfloat> %in, ptr addrspace(1) %out
   ret void
 }
 
-define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
-; GCN-LABEL: test_call_v16bf16:
-; GCN:       ; %bb.0: ; %entry
+define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_arg_store_v16bf16:
+; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s33
-; GCN-NEXT:    s_mov_b32 s33, s32
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_writelane_b32 v17, s30, 0
-; GCN-NEXT:    v_writelane_b32 v17, s31, 1
-; GCN-NEXT:    s_getpc_b64 s[4:5]
-; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GCN-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GCN-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT:    v_lshrrev_b32_e32 v18, 16, v5
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GCN-NEXT:    v_add_i32_e32 v18, vcc, 30, v16
-; GCN-NEXT:    v_add_i32_e32 v19, vcc, 28, v16
-; GCN-NEXT:    v_add_i32_e32 v20, vcc, 26, v16
-; GCN-NEXT:    v_add_i32_e32 v21, vcc, 24, v16
-; GCN-NEXT:    v_add_i32_e32 v22, vcc, 22, v16
-; GCN-NEXT:    v_add_i32_e32 v23, vcc, 20, v16
-; GCN-NEXT:    v_add_i32_e32 v24, vcc, 18, v16
-; GCN-NEXT:    v_add_i32_e32 v25, vcc, 16, v16
-; GCN-NEXT:    v_add_i32_e32 v26, vcc, 14, v16
-; GCN-NEXT:    v_add_i32_e32 v27, vcc, 12, v16
-; GCN-NEXT:    v_add_i32_e32 v28, vcc, 10, v16
-; GCN-NEXT:    buffer_store_short v15, v18, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v15, vcc, 8, v16
-; GCN-NEXT:    v_add_i32_e32 v18, vcc, 6, v16
-; GCN-NEXT:    buffer_store_short v14, v19, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v14, vcc, 4, v16
-; GCN-NEXT:    v_add_i32_e32 v19, vcc, 2, v16
-; GCN-NEXT:    buffer_store_short v13, v20, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v12, v21, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v11, v22, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v10, v23, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v9, v24, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v8, v25, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v7, v26, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v6, v27, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v5, v28, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v4, v15, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v3, v18, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v2, v14, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v1, v19, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v0, v16, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v17, 1
-; GCN-NEXT:    v_readlane_b32 s30, v17, 0
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    s_mov_b32 s33, s8
+; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v13
+; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT:    v_alignbit_b32 v5, v7, v6, 16
+; GCN-NEXT:    v_alignbit_b32 v4, v18, v4, 16
+; GCN-NEXT:    v_alignbit_b32 v3, v3, v2, 16
+; GCN-NEXT:    v_alignbit_b32 v2, v1, v0, 16
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    v_alignbit_b32 v13, v15, v14, 16
+; GCN-NEXT:    v_alignbit_b32 v12, v19, v12, 16
+; GCN-NEXT:    v_alignbit_b32 v11, v11, v10, 16
+; GCN-NEXT:    v_alignbit_b32 v10, v9, v8, 16
+; GCN-NEXT:    buffer_store_dwordx4 v[10:13], v[16:17], s[4:7], 0 addr64 offset:16
+; GCN-NEXT:    buffer_store_dwordx4 v[2:5], v[16:17], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_call_v16bf16:
-; GFX7:       ; %bb.0: ; %entry
+; GFX7-LABEL: test_arg_store_v16bf16:
+; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s8, s33
-; GFX7-NEXT:    s_mov_b32 s33, s32
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_addk_i32 s32, 0x400
-; GFX7-NEXT:    s_getpc_b64 s[4:5]
-; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX7-NEXT:    v_writelane_b32 v17, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v17, s31, 1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX7-NEXT:    v_add_i32_e32 v18, vcc, 30, v16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT:    buffer_store_short v15, v18, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v15, vcc, 28, v16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GFX7-NEXT:    buffer_store_short v14, v15, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v14, vcc, 26, v16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX7-NEXT:    buffer_store_short v13, v14, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v13, vcc, 24, v16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GFX7-NEXT:    buffer_store_short v12, v13, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v12, vcc, 22, v16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GFX7-NEXT:    buffer_store_short v11, v12, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v11, vcc, 20, v16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX7-NEXT:    buffer_store_short v10, v11, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v10, vcc, 18, v16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX7-NEXT:    buffer_store_short v9, v10, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v9, vcc, 16, v16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX7-NEXT:    buffer_store_short v8, v9, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 14, v16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT:    buffer_store_short v7, v8, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 12, v16
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT:    buffer_store_short v6, v7, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v6, vcc, 10, v16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    buffer_store_short v5, v6, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 8, v16
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    buffer_store_short v4, v5, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 6, v16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    buffer_store_short v3, v4, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 4, v16
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    buffer_store_short v2, v3, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 2, v16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_store_short v0, v16, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_readlane_b32 s31, v17, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v17, 0
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX7-NEXT:    s_mov_b32 s33, s8
+; GFX7-NEXT:    v_alignbit_b32 v5, v5, v4, 16
+; GFX7-NEXT:    v_alignbit_b32 v4, v3, v2, 16
+; GFX7-NEXT:    v_alignbit_b32 v3, v1, v0, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
+; GFX7-NEXT:    v_alignbit_b32 v14, v0, v14, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v13
+; GFX7-NEXT:    v_alignbit_b32 v13, v0, v12, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    v_alignbit_b32 v12, v0, v10, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v9
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    v_alignbit_b32 v11, v0, v8, 16
+; GFX7-NEXT:    v_alignbit_b32 v6, v7, v6, 16
+; GFX7-NEXT:    buffer_store_dwordx4 v[11:14], v[16:17], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT:    buffer_store_dwordx4 v[3:6], v[16:17], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_call_v16bf16:
-; GFX8:       ; %bb.0: ; %entry
+; GFX8-LABEL: test_arg_store_v16bf16:
+; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s6, s33
-; GFX8-NEXT:    s_mov_b32 s33, s32
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_addk_i32 s32, 0x400
-; GFX8-NEXT:    s_getpc_b64 s[4:5]
-; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX8-NEXT:    v_writelane_b32 v9, s30, 0
-; GFX8-NEXT:    v_writelane_b32 v9, s31, 1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT:    v_add_u32_e32 v18, vcc, 28, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
-; GFX8-NEXT:    buffer_store_short v7, v18, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 24, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
-; GFX8-NEXT:    buffer_store_short v6, v7, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 20, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v5
-; GFX8-NEXT:    buffer_store_short v5, v6, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 16, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
-; GFX8-NEXT:    buffer_store_short v4, v5, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 12, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
-; GFX8-NEXT:    buffer_store_short v3, v4, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 8, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
-; GFX8-NEXT:    buffer_store_short v2, v3, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GFX8-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    buffer_store_short v0, v8, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 30, v8
-; GFX8-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 26, v8
-; GFX8-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 22, v8
-; GFX8-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 18, v8
-; GFX8-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 14, v8
-; GFX8-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 10, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
-; GFX8-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 6, v8
-; GFX8-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 2, v8
-; GFX8-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v9, 0
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX8-NEXT:    s_mov_b32 s33, s6
+; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v8
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v9, vcc
+; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_call_v16bf16:
-; GFX9:       ; %bb.0: ; %entry
+; GFX9-LABEL: test_arg_store_v16bf16:
+; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s6, s33
-; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v9, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v9, s31, 1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    buffer_store_short_d16_hi v7, v8, s[0:3], 0 offen offset:30
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v7, v8, s[0:3], 0 offen offset:28
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v6, v8, s[0:3], 0 offen offset:26
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v6, v8, s[0:3], 0 offen offset:24
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v5, v8, s[0:3], 0 offen offset:22
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v5, v8, s[0:3], 0 offen offset:20
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v4, v8, s[0:3], 0 offen offset:18
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v4, v8, s[0:3], 0 offen offset:16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v3, v8, s[0:3], 0 offen offset:14
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v3, v8, s[0:3], 0 offen offset:12
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v2, v8, s[0:3], 0 offen offset:10
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v2, v8, s[0:3], 0 offen offset:8
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v1, v8, s[0:3], 0 offen offset:6
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v1, v8, s[0:3], 0 offen offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v0, v8, s[0:3], 0 offen offset:2
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v0, v8, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v9, 0
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off offset:16
+; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_call_v16bf16:
-; GFX10:       ; %bb.0: ; %entry
+; GFX10-LABEL: test_arg_store_v16bf16:
+; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s6, s33
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v9, s30, 0
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_writelane_b32 v9, s31, 1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    buffer_store_short_d16_hi v7, v8, s[0:3], 0 offen offset:30
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v7, v8, s[0:3], 0 offen offset:28
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short_d16_hi v6, v8, s[0:3], 0 offen offset:26
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v6, v8, s[0:3], 0 offen offset:24
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short_d16_hi v5, v8, s[0:3], 0 offen offset:22
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v5, v8, s[0:3], 0 offen offset:20
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short_d16_hi v4, v8, s[0:3], 0 offen offset:18
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v4, v8, s[0:3], 0 offen offset:16
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short_d16_hi v3, v8, s[0:3], 0 offen offset:14
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v3, v8, s[0:3], 0 offen offset:12
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short_d16_hi v2, v8, s[0:3], 0 offen offset:10
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v2, v8, s[0:3], 0 offen offset:8
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short_d16_hi v1, v8, s[0:3], 0 offen offset:6
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v1, v8, s[0:3], 0 offen offset:4
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short_d16_hi v0, v8, s[0:3], 0 offen offset:2
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v0, v8, s[0:3], 0 offen
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v9, 0
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    s_mov_b32 s33, s6
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off offset:16
+; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_call_v16bf16:
-; GFX11:       ; %bb.0: ; %entry
+; GFX11-LABEL: test_arg_store_v16bf16:
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s2, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_store_b32 off, v9, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX11-NEXT:    v_writelane_b32 v9, s30, 0
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    v_writelane_b32 v9, s31, 1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_add_nc_u32_e32 v10, 30, v8
-; GFX11-NEXT:    v_add_nc_u32_e32 v11, 28, v8
-; GFX11-NEXT:    v_add_nc_u32_e32 v12, 20, v8
-; GFX11-NEXT:    v_add_nc_u32_e32 v13, 18, v8
-; GFX11-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v10, v7, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b16 v11, v7, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_add_nc_u32_e32 v7, 26, v8
-; GFX11-NEXT:    v_add_nc_u32_e32 v10, 24, v8
-; GFX11-NEXT:    v_add_nc_u32_e32 v11, 22, v8
-; GFX11-NEXT:    v_readlane_b32 s30, v9, 0
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v7, v6, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b16 v10, v6, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v11, v5, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b16 v12, v5, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v13, v4, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_add_nc_u32_e32 v5, 14, v8
-; GFX11-NEXT:    v_add_nc_u32_e32 v6, 12, v8
-; GFX11-NEXT:    v_add_nc_u32_e32 v7, 10, v8
-; GFX11-NEXT:    scratch_store_b16 v8, v4, off offset:16 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_add_nc_u32_e32 v4, 6, v8
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v5, v3, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b16 v6, v3, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v7, v2, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b16 v8, v2, off offset:8 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v4, v1, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b16 v8, v1, off offset:4 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v8, v0, off offset:2 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b16 v8, v0, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_load_b32 v9, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_add_i32 s32, s32, -16
-; GFX11-NEXT:    s_mov_b32 s33, s2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b128 v[8:9], v[4:7], off offset:16
+; GFX11-NEXT:    global_store_b128 v[8:9], v[0:3], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %result = call <16 x bfloat> @test_arg_store_v2bf16(<16 x bfloat> %in)
-  store volatile <16 x bfloat> %result, ptr addrspace(5) %out
+  store <16 x bfloat> %in, ptr addrspace(1) %out
   ret void
 }
 
-define bfloat @test_alloca_load_store_ret(bfloat %in) {
-; GCN-LABEL: test_alloca_load_store_ret:
-; GCN:       ; %bb.0: ; %entry
+define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_inreg_arg_store:
+; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT:    buffer_store_short v0, off, s[0:3], s32
+; GCN-NEXT:    s_lshr_b32 s34, s4, 16
+; GCN-NEXT:    s_mov_b32 s38, 0
+; GCN-NEXT:    s_mov_b32 s39, 0xf000
+; GCN-NEXT:    s_mov_b32 s36, s38
+; GCN-NEXT:    s_mov_b32 s37, s38
+; GCN-NEXT:    v_mov_b32_e32 v2, s34
+; GCN-NEXT:    buffer_store_short v2, v[0:1], s[36:39], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_alloca_load_store_ret:
-; GFX7:       ; %bb.0: ; %entry
+; GFX7-LABEL: test_inreg_arg_store:
+; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], s32
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
+; GFX7-NEXT:    s_lshr_b32 s34, s4, 16
+; GFX7-NEXT:    s_mov_b32 s38, 0
+; GFX7-NEXT:    s_mov_b32 s39, 0xf000
+; GFX7-NEXT:    s_mov_b32 s36, s38
+; GFX7-NEXT:    s_mov_b32 s37, s38
+; GFX7-NEXT:    v_mov_b32_e32 v2, s34
+; GFX7-NEXT:    buffer_store_short v2, v[0:1], s[36:39], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_alloca_load_store_ret:
-; GFX8:       ; %bb.0: ; %entry
+; GFX8-LABEL: test_inreg_arg_store:
+; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    buffer_store_short v0, off, s[0:3], s32
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
+; GFX8-NEXT:    s_lshr_b32 s34, s4, 16
+; GFX8-NEXT:    v_mov_b32_e32 v2, s34
+; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_alloca_load_store_ret:
-; GFX9:       ; %bb.0: ; %entry
+; GFX9-LABEL: test_inreg_arg_store:
+; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v0, off, s[0:3], s32
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    buffer_load_short_d16_hi v0, off, s[0:3], s32 glc
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    global_store_short_d16_hi v[0:1], v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_alloca_load_store_ret:
-; GFX10:       ; %bb.0: ; %entry
+; GFX10-LABEL: test_inreg_arg_store:
+; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    buffer_store_short_d16_hi v0, off, s[0:3], s32
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_load_short_d16_hi v1, off, s[0:3], s32 glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-NEXT:    global_store_short_d16_hi v[0:1], v2, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_alloca_load_store_ret:
-; GFX11:       ; %bb.0: ; %entry
+; GFX11-LABEL: test_inreg_arg_store:
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    scratch_store_d16_hi_b16 off, v0, s32 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_d16_hi_b16 v1, off, s32 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    global_store_d16_hi_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %in.addr = alloca bfloat, align 2, addrspace(5)
-  store volatile bfloat %in, ptr addrspace(5) %in.addr, align 2
-  %loaded = load volatile bfloat, ptr addrspace(5) %in.addr, align 2
-  ret bfloat %loaded
+  store bfloat %in, ptr addrspace(1) %out
+  ret void
 }
 
-define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
-; GCN-LABEL: test_overflow_stack:
+define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) {
+; GCN-LABEL: test_byval:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GCN-NEXT:    buffer_store_short v1, off, s[0:3], s32
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_byval:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX7-NEXT:    buffer_store_short v1, off, s[0:3], s32
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_byval:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT:    buffer_store_short v1, off, s[0:3], s32
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_byval:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_store_short_d16_hi v0, off, s[0:3], s32
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_byval:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_store_short_d16_hi v0, off, s[0:3], s32
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_byval:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    scratch_store_d16_hi_b16 off, v0, s32
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store bfloat %val, ptr addrspace(5) %bv
+  %retval = load bfloat, ptr addrspace(5) %bv
+  ret bfloat %retval
+}
+
+define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) {
+; GCN-LABEL: test_sret:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8
-; GCN-NEXT:    v_add_i32_e32 v31, vcc, 0x7c, v0
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32
-; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
-; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    buffer_store_dword v32, v2, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0x74, v0
-; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    buffer_store_dword v33, v2, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v0
-; GCN-NEXT:    v_add_i32_e32 v31, vcc, 0x6c, v0
-; GCN-NEXT:    buffer_store_dword v30, v2, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v30, vcc, 0x64, v0
-; GCN-NEXT:    buffer_store_dword v29, v31, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v29, vcc, 0x60, v0
-; GCN-NEXT:    v_add_i32_e32 v31, vcc, 0x5c, v0
-; GCN-NEXT:    buffer_store_dword v28, v2, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0x58, v0
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v28, vcc, 0x54, v0
-; GCN-NEXT:    buffer_store_dword v27, v30, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v27, vcc, 0x50, v0
-; GCN-NEXT:    v_add_i32_e32 v30, vcc, 0x4c, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    buffer_store_dword v26, v29, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v26, vcc, 0x48, v0
-; GCN-NEXT:    v_add_i32_e32 v29, vcc, 0x44, v0
-; GCN-NEXT:    buffer_store_dword v25, v31, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v25, vcc, 64, v0
-; GCN-NEXT:    v_add_i32_e32 v31, vcc, 60, v0
-; GCN-NEXT:    buffer_store_dword v24, v2, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 56, v0
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v24, vcc, 52, v0
-; GCN-NEXT:    buffer_store_dword v23, v28, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v23, vcc, 48, v0
-; GCN-NEXT:    v_add_i32_e32 v28, vcc, 44, v0
-; GCN-NEXT:    buffer_store_dword v22, v27, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v22, vcc, 40, v0
-; GCN-NEXT:    v_add_i32_e32 v27, vcc, 36, v0
-; GCN-NEXT:    buffer_store_dword v21, v30, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v21, vcc, 32, v0
-; GCN-NEXT:    v_add_i32_e32 v30, vcc, 28, v0
-; GCN-NEXT:    buffer_store_dword v20, v26, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v20, vcc, 24, v0
-; GCN-NEXT:    v_add_i32_e32 v26, vcc, 20, v0
-; GCN-NEXT:    buffer_store_dword v19, v29, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v19, vcc, 16, v0
-; GCN-NEXT:    v_add_i32_e32 v29, vcc, 12, v0
-; GCN-NEXT:    buffer_store_dword v18, v25, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v18, vcc, 8, v0
-; GCN-NEXT:    v_add_i32_e32 v25, vcc, 4, v0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 0x80, v0
-; GCN-NEXT:    buffer_store_dword v17, v31, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v16, v2, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v15, v24, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v14, v23, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v13, v28, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v12, v22, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v11, v27, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v10, v21, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v9, v30, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v8, v20, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v7, v26, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v6, v19, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v5, v29, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v4, v18, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v3, v25, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: test_overflow_stack:
+; GFX7-LABEL: test_sret:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8
-; GFX7-NEXT:    v_add_i32_e32 v31, vcc, 0x7c, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
-; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4
-; GFX7-NEXT:    v_add_i32_e32 v31, vcc, 0x78, v0
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
-; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s32
-; GFX7-NEXT:    v_add_i32_e32 v31, vcc, 0x74, v0
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v0
-; GFX7-NEXT:    buffer_store_dword v30, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x6c, v0
-; GFX7-NEXT:    buffer_store_dword v29, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
-; GFX7-NEXT:    buffer_store_dword v28, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
-; GFX7-NEXT:    buffer_store_dword v27, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x60, v0
-; GFX7-NEXT:    buffer_store_dword v26, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
-; GFX7-NEXT:    buffer_store_dword v25, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x58, v0
-; GFX7-NEXT:    buffer_store_dword v24, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
-; GFX7-NEXT:    buffer_store_dword v23, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x50, v0
-; GFX7-NEXT:    buffer_store_dword v22, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
-; GFX7-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x48, v0
-; GFX7-NEXT:    buffer_store_dword v20, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
-; GFX7-NEXT:    buffer_store_dword v19, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 64, v0
-; GFX7-NEXT:    buffer_store_dword v18, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
-; GFX7-NEXT:    buffer_store_dword v17, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 56, v0
-; GFX7-NEXT:    buffer_store_dword v16, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 52, v0
-; GFX7-NEXT:    buffer_store_dword v15, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 48, v0
-; GFX7-NEXT:    buffer_store_dword v14, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 44, v0
-; GFX7-NEXT:    buffer_store_dword v13, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 40, v0
-; GFX7-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 36, v0
-; GFX7-NEXT:    buffer_store_dword v11, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 32, v0
-; GFX7-NEXT:    buffer_store_dword v10, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 28, v0
-; GFX7-NEXT:    buffer_store_dword v9, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 24, v0
-; GFX7-NEXT:    buffer_store_dword v8, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 20, v0
-; GFX7-NEXT:    buffer_store_dword v7, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 16, v0
-; GFX7-NEXT:    buffer_store_dword v6, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 12, v0
-; GFX7-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
-; GFX7-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x80, v0
-; GFX7-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
 ; GFX7-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: test_overflow_stack:
+; GFX8-LABEL: test_sret:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8
-; GFX8-NEXT:    v_add_u32_e32 v31, vcc, 0x7c, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
-; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4
-; GFX8-NEXT:    v_add_u32_e32 v31, vcc, 0x78, v0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
-; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s32
-; GFX8-NEXT:    v_add_u32_e32 v31, vcc, 0x74, v0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x70, v0
-; GFX8-NEXT:    buffer_store_dword v30, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x6c, v0
-; GFX8-NEXT:    buffer_store_dword v29, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x68, v0
-; GFX8-NEXT:    buffer_store_dword v28, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x64, v0
-; GFX8-NEXT:    buffer_store_dword v27, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x60, v0
-; GFX8-NEXT:    buffer_store_dword v26, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x5c, v0
-; GFX8-NEXT:    buffer_store_dword v25, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x58, v0
-; GFX8-NEXT:    buffer_store_dword v24, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x54, v0
-; GFX8-NEXT:    buffer_store_dword v23, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x50, v0
-; GFX8-NEXT:    buffer_store_dword v22, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x4c, v0
-; GFX8-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x48, v0
-; GFX8-NEXT:    buffer_store_dword v20, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x44, v0
-; GFX8-NEXT:    buffer_store_dword v19, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 64, v0
-; GFX8-NEXT:    buffer_store_dword v18, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 60, v0
-; GFX8-NEXT:    buffer_store_dword v17, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 56, v0
-; GFX8-NEXT:    buffer_store_dword v16, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 52, v0
-; GFX8-NEXT:    buffer_store_dword v15, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 48, v0
-; GFX8-NEXT:    buffer_store_dword v14, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 44, v0
-; GFX8-NEXT:    buffer_store_dword v13, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 40, v0
-; GFX8-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 36, v0
-; GFX8-NEXT:    buffer_store_dword v11, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 32, v0
-; GFX8-NEXT:    buffer_store_dword v10, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 28, v0
-; GFX8-NEXT:    buffer_store_dword v9, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 24, v0
-; GFX8-NEXT:    buffer_store_dword v8, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 20, v0
-; GFX8-NEXT:    buffer_store_dword v7, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 16, v0
-; GFX8-NEXT:    buffer_store_dword v6, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 12, v0
-; GFX8-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
-; GFX8-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x80, v0
-; GFX8-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
-; GFX8-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: test_overflow_stack:
+; GFX9-LABEL: test_sret:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:112
-; GFX9-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:108
-; GFX9-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
-; GFX9-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
-; GFX9-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
-; GFX9-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
-; GFX9-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
-; GFX9-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
-; GFX9-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
-; GFX9-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
-; GFX9-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
-; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
-; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
-; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
-; GFX9-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
-; GFX9-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:48
-; GFX9-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:44
-; GFX9-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:40
-; GFX9-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:36
-; GFX9-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:32
-; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
-; GFX9-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
-; GFX9-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
-; GFX9-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
-; GFX9-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
-; GFX9-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
-; GFX9-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(18)
-; GFX9-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:124
-; GFX9-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:120
-; GFX9-NEXT:    s_waitcnt vmcnt(18)
-; GFX9-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:116
-; GFX9-NEXT:    buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen offset:128
+; GFX9-NEXT:    buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: test_overflow_stack:
+; GFX10-LABEL: test_sret:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
-; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32
-; GFX10-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:112
-; GFX10-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:108
-; GFX10-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
-; GFX10-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
-; GFX10-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
-; GFX10-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
-; GFX10-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
-; GFX10-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
-; GFX10-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
-; GFX10-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
-; GFX10-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
-; GFX10-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
-; GFX10-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
-; GFX10-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
-; GFX10-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
-; GFX10-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
-; GFX10-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:48
-; GFX10-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:44
-; GFX10-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:40
-; GFX10-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:36
-; GFX10-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:32
-; GFX10-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
-; GFX10-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
-; GFX10-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
-; GFX10-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
-; GFX10-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
-; GFX10-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
-; GFX10-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
-; GFX10-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX10-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:124
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:120
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen offset:116
-; GFX10-NEXT:    buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen offset:128
+; GFX10-NEXT:    buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_overflow_stack:
+; GFX11-LABEL: test_sret:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-NEXT:    s_clause 0x4
-; GFX11-NEXT:    scratch_store_b128 off, v[18:21], s0 offset:64
-; GFX11-NEXT:    scratch_store_b128 off, v[10:13], s0 offset:32
-; GFX11-NEXT:    scratch_store_b128 off, v[6:9], s0 offset:16
-; GFX11-NEXT:    scratch_store_b128 off, v[2:5], s0
-; GFX11-NEXT:    scratch_store_d16_hi_b16 off, v1, s0 offset:128
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x70
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x60
-; GFX11-NEXT:    s_add_i32 s3, s0, 0x50
-; GFX11-NEXT:    s_add_i32 s0, s0, 48
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[30:33], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[26:29], s2
-; GFX11-NEXT:    scratch_store_b128 off, v[22:25], s3
-; GFX11-NEXT:    scratch_store_b128 off, v[14:17], s0
+; GFX11-NEXT:    scratch_store_d16_hi_b16 v0, v1, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0
-  %ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1
-  ret { <32 x i32>, bfloat } %ins.1
+  store bfloat %val, ptr addrspace(5) %sret
+  ret void
 }
 
-define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) {
-; GCN-LABEL: global_extload_v2bf16_to_v2f32:
+define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_bitcast_from_bfloat:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s4, s6
 ; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_extload_v2bf16_to_v2f32:
+; GFX7-LABEL: test_bitcast_from_bfloat:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_extload_v2bf16_to_v2f32:
+; GFX8-LABEL: test_bitcast_from_bfloat:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dword v1, v[0:1]
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_short v[2:3], v0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_extload_v2bf16_to_v2f32:
+; GFX9-LABEL: test_bitcast_from_bfloat:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v[0:1], off
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_short v[2:3], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_extload_v2bf16_to_v2f32:
+; GFX10-LABEL: test_bitcast_from_bfloat:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v[0:1], off
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    global_store_short v[2:3], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_extload_v2bf16_to_v2f32:
+; GFX11-LABEL: test_bitcast_from_bfloat:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    global_store_b16 v[2:3], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <2 x bfloat>, ptr addrspace(1) %ptr
-  %fpext = fpext <2 x bfloat> %load to <2 x float>
-  ret <2 x float> %fpext
+  %val = load bfloat, ptr addrspace(1) %in
+  %val_int = bitcast bfloat %val to i16
+  store i16 %val_int, ptr addrspace(1) %out
+  ret void
 }
 
-define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) {
-; GCN-LABEL: global_extload_v3bf16_to_v3f32:
+define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: test_bitcast_to_bfloat:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s4, s6
 ; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT:    buffer_store_short v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_extload_v3bf16_to_v3f32:
+; GFX7-LABEL: test_bitcast_to_bfloat:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_short v2, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_extload_v3bf16_to_v3f32:
+; GFX8-LABEL: test_bitcast_to_bfloat:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[1:2], v[0:1]
+; GFX8-NEXT:    flat_load_ushort v2, v[2:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_extload_v3bf16_to_v3f32:
+; GFX9-LABEL: test_bitcast_to_bfloat:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    global_load_ushort v2, v[2:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_extload_v3bf16_to_v3f32:
+; GFX10-LABEL: test_bitcast_to_bfloat:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
+; GFX10-NEXT:    global_load_ushort v2, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_extload_v3bf16_to_v3f32:
+; GFX11-LABEL: test_bitcast_to_bfloat:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
+; GFX11-NEXT:    global_load_u16 v2, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <3 x bfloat>, ptr addrspace(1) %ptr
-  %fpext = fpext <3 x bfloat> %load to <3 x float>
-  ret <3 x float> %fpext
+  %val = load i16, ptr addrspace(1) %in
+  %val_fp = bitcast i16 %val to bfloat
+  store bfloat %val_fp, ptr addrspace(1) %out
+  ret void
 }
 
-define <4 x float> @global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) {
-; GCN-LABEL: global_extload_v4bf16_to_v4f32:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+define bfloat @test_ret(bfloat %in) {
+; GCN-LABEL: test_ret:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_extload_v4bf16_to_v4f32:
-; GFX7:       ; %bb.0:
+; GFX7-LABEL: test_ret:
+; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_extload_v4bf16_to_v4f32:
-; GFX8:       ; %bb.0:
+; GFX8-LABEL: test_ret:
+; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_extload_v4bf16_to_v4f32:
-; GFX9:       ; %bb.0:
+; GFX9-LABEL: test_ret:
+; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_extload_v4bf16_to_v4f32:
-; GFX10:       ; %bb.0:
+; GFX10-LABEL: test_ret:
+; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_extload_v4bf16_to_v4f32:
-; GFX11:       ; %bb.0:
+; GFX11-LABEL: test_ret:
+; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <4 x bfloat>, ptr addrspace(1) %ptr
-  %fpext = fpext <4 x bfloat> %load to <4 x float>
-  ret <4 x float> %fpext
+entry:
+  ret bfloat %in
 }
 
-define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) {
-; GCN-LABEL: global_extload_v5bf16_to_v5f32:
-; GCN:       ; %bb.0:
+define <2 x bfloat> @test_ret_v2bf16(<2 x bfloat> %in) {
+; GCN-LABEL: test_ret_v2bf16:
+; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:8
-; GCN-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_extload_v5bf16_to_v5f32:
-; GFX7:       ; %bb.0:
+; GFX7-LABEL: test_ret_v2bf16:
+; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:8
-; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_extload_v5bf16_to_v5f32:
-; GFX8:       ; %bb.0:
+; GFX8-LABEL: test_ret_v2bf16:
+; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_extload_v5bf16_to_v5f32:
-; GFX9:       ; %bb.0:
+; GFX9-LABEL: test_ret_v2bf16:
+; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[0:1], off
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    global_load_short_d16_hi v4, v[0:1], off offset:8
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_extload_v5bf16_to_v5f32:
-; GFX10:       ; %bb.0:
+; GFX10-LABEL: test_ret_v2bf16:
+; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-NEXT:    global_load_short_d16_hi v4, v[0:1], off offset:8
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_extload_v5bf16_to_v5f32:
-; GFX11:       ; %bb.0:
+; GFX11-LABEL: test_ret_v2bf16:
+; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v[0:1], off
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
-; GFX11-NEXT:    global_load_d16_hi_b16 v4, v[0:1], off offset:8
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <5 x bfloat>, ptr addrspace(1) %ptr
-  %fpext = fpext <5 x bfloat> %load to <5 x float>
-  ret <5 x float> %fpext
+entry:
+  ret <2 x bfloat> %in
 }
 
-define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) {
-; GCN-LABEL: global_extload_v6bf16_to_v6f32:
-; GCN:       ; %bb.0:
+define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) {
+; GCN-LABEL: test_ret_v3bf16:
+; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
-; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_extload_v6bf16_to_v6f32:
-; GFX7:       ; %bb.0:
+; GFX7-LABEL: test_ret_v3bf16:
+; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dwordx3 v[3:5], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
-; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_extload_v6bf16_to_v6f32:
-; GFX8:       ; %bb.0:
+; GFX8-LABEL: test_ret_v3bf16:
+; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx3 v[3:5], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_extload_v6bf16_to_v6f32:
-; GFX9:       ; %bb.0:
+; GFX9-LABEL: test_ret_v3bf16:
+; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx3 v[6:8], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v7
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v8
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_extload_v6bf16_to_v6f32:
-; GFX10:       ; %bb.0:
+; GFX10-LABEL: test_ret_v3bf16:
+; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx3 v[4:6], v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_extload_v6bf16_to_v6f32:
-; GFX11:       ; %bb.0:
+; GFX11-LABEL: test_ret_v3bf16:
+; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b96 v[4:6], v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <6 x bfloat>, ptr addrspace(1) %ptr
-  %fpext = fpext <6 x bfloat> %load to <6 x float>
-  ret <6 x float> %fpext
+entry:
+  ret <3 x bfloat> %in
 }
 
-define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) {
-; GCN-LABEL: global_extload_v8bf16_to_v8f32:
-; GCN:       ; %bb.0:
+define <4 x bfloat> @test_ret_v4bf16(<4 x bfloat> %in) {
+; GCN-LABEL: test_ret_v4bf16:
+; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
-; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_extload_v8bf16_to_v8f32:
-; GFX7:       ; %bb.0:
+; GFX7-LABEL: test_ret_v4bf16:
+; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_extload_v8bf16_to_v8f32:
-; GFX8:       ; %bb.0:
+; GFX8-LABEL: test_ret_v4bf16:
+; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_extload_v8bf16_to_v8f32:
-; GFX9:       ; %bb.0:
+; GFX9-LABEL: test_ret_v4bf16:
+; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v8
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v9
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v10
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v11
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v9
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v10
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_extload_v8bf16_to_v8f32:
-; GFX10:       ; %bb.0:
+; GFX10-LABEL: test_ret_v4bf16:
+; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v7
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v8
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v10
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_extload_v8bf16_to_v8f32:
-; GFX11:       ; %bb.0:
+; GFX11-LABEL: test_ret_v4bf16:
+; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[7:10], v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v7
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v8
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v10
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <8 x bfloat>, ptr addrspace(1) %ptr
-  %fpext = fpext <8 x bfloat> %load to <8 x float>
-  ret <8 x float> %fpext
+entry:
+  ret <4 x bfloat> %in
 }
 
-define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) {
-; GCN-LABEL: global_extload_v16bf16_to_v16f32:
-; GCN:       ; %bb.0:
+define <8 x bfloat> @test_ret_v8bf16(<8 x bfloat> %in) {
+; GCN-LABEL: test_ret_v8bf16:
+; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
-; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
-; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
-; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
-; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
-; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
-; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
-; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_extload_v16bf16_to_v16f32:
-; GFX7:       ; %bb.0:
+; GFX7-LABEL: test_ret_v8bf16:
+; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
-; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
-; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
-; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
-; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
-; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_extload_v16bf16_to_v16f32:
-; GFX8:       ; %bb.0:
+; GFX8-LABEL: test_ret_v8bf16:
+; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
-; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
-; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_extload_v16bf16_to_v16f32:
-; GFX9:       ; %bb.0:
+; GFX9-LABEL: test_ret_v8bf16:
+; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:16
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v16
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v17
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v18
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v19
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v20
-; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v21
-; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v22
-; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v23
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v16
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v17
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v18
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v19
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v20
-; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v21
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v22
-; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v23
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_extload_v16bf16_to_v16f32:
-; GFX10:       ; %bb.0:
+; GFX10-LABEL: test_ret_v8bf16:
+; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:16
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v16
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v17
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v18
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v19
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v20
-; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v21
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v22
-; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v21
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v23
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_extload_v16bf16_to_v16f32:
-; GFX11:       ; %bb.0:
+; GFX11-LABEL: test_ret_v8bf16:
+; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b128 v[16:19], v[0:1], off
-; GFX11-NEXT:    global_load_b128 v[20:23], v[0:1], off offset:16
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v16
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v17
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v18
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v19
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v20
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v21
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v22
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v23
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <16 x bfloat>, ptr addrspace(1) %ptr
-  %fpext = fpext <16 x bfloat> %load to <16 x float>
-  ret <16 x float> %fpext
+entry:
+  ret <8 x bfloat> %in
 }
 
-define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) {
-; GCN-LABEL: global_extload_v32bf16_to_v32f32:
-; GCN:       ; %bb.0:
+define <16 x bfloat> @test_ret_v16bf16(<16 x bfloat> %in) {
+; GCN-LABEL: test_ret_v16bf16:
+; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
-; GCN-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
-; GCN-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
-; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
-; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
-; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
-; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
-; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
-; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
-; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
-; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
-; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
-; GCN-NEXT:    v_lshlrev_b32_e32 v18, 16, v21
-; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v21
-; GCN-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
-; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v22
-; GCN-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v24, 16, v28
-; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v28
-; GCN-NEXT:    v_lshlrev_b32_e32 v26, 16, v29
-; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v29
-; GCN-NEXT:    v_lshlrev_b32_e32 v28, 16, v30
-; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v30
-; GCN-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
-; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_extload_v32bf16_to_v32f32:
-; GFX7:       ; %bb.0:
+; GFX7-LABEL: test_ret_v16bf16:
+; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
-; GFX7-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
-; GFX7-NEXT:    s_waitcnt vmcnt(3)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
-; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
-; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
-; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
-; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
-; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
-; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
-; GFX7-NEXT:    v_lshlrev_b32_e32 v18, 16, v21
-; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v21
-; GFX7-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
-; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v22
-; GFX7-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v24, 16, v28
-; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v28
-; GFX7-NEXT:    v_lshlrev_b32_e32 v26, 16, v29
-; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v29
-; GFX7-NEXT:    v_lshlrev_b32_e32 v28, 16, v30
-; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v30
-; GFX7-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
-; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_extload_v32bf16_to_v32f32:
-; GFX8:       ; %bb.0:
+; GFX8-LABEL: test_ret_v16bf16:
+; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 16, v0
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
-; GFX8-NEXT:    flat_load_dwordx4 v[12:15], v[2:3]
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 32, v0
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 48, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dwordx4 v[20:23], v[2:3]
-; GFX8-NEXT:    flat_load_dwordx4 v[28:31], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(3)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
-; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
-; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
-; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v21
-; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v21
-; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
-; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v22
-; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v28
-; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v28
-; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v29
-; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v29
-; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v30
-; GFX8-NEXT:    v_and_b32_e32 v29, 0xffff0000, v30
-; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
-; GFX8-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_extload_v32bf16_to_v32f32:
-; GFX9:       ; %bb.0:
+; GFX9-LABEL: test_ret_v16bf16:
+; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:16
-; GFX9-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:32
-; GFX9-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:48
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v16
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v17
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v18
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v19
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v20
-; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v21
-; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v22
-; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v23
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v16
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v17
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v18
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v19
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v20
-; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v21
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v22
-; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v23
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v24
-; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v25
-; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v26
-; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v27
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v25
-; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v26
-; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v27
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v25, 0xffff0000, v32
-; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff0000, v33
-; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v34
-; GFX9-NEXT:    v_and_b32_e32 v31, 0xffff0000, v35
-; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v32
-; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v33
-; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v34
-; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v35
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_extload_v32bf16_to_v32f32:
-; GFX10:       ; %bb.0:
+; GFX10-LABEL: test_ret_v16bf16:
+; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx4 v[36:39], v[0:1], off offset:16
-; GFX10-NEXT:    global_load_dwordx4 v[48:51], v[0:1], off offset:32
-; GFX10-NEXT:    global_load_dwordx4 v[52:55], v[0:1], off offset:48
-; GFX10-NEXT:    s_waitcnt vmcnt(3)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v32
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v33
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v34
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v35
-; GFX10-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v36
-; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v37
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v38
-; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v39
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v48
-; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v49
-; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v50
-; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v51
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v52
-; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v53
-; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v54
-; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v55
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v32
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v33
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v34
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v35
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v36
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v37
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v38
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v39
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v48
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v49
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v50
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v51
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v52
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v53
-; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v54
-; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v55
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_extload_v32bf16_to_v32f32:
-; GFX11:       ; %bb.0:
+; GFX11-LABEL: test_ret_v16bf16:
+; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    global_load_b128 v[32:35], v[0:1], off
-; GFX11-NEXT:    global_load_b128 v[36:39], v[0:1], off offset:16
-; GFX11-NEXT:    global_load_b128 v[48:51], v[0:1], off offset:32
-; GFX11-NEXT:    global_load_b128 v[52:55], v[0:1], off offset:48
-; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v32
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v33
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v34
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v35
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v36
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v37
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v38
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v39
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v48
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v49
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v50
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v51
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v52
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v53
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v54
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xffff0000, v55
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v32
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v33
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v34
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v35
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v36
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v37
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v38
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v39
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v48
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v49
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v50
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v51
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v52
-; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v53
-; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v54
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v55
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <32 x bfloat>, ptr addrspace(1) %ptr
-  %fpext = fpext <32 x bfloat> %load to <32 x float>
-  ret <32 x float> %fpext
+entry:
+  ret <16 x bfloat> %in
 }
 
-define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) {
-; GCN-LABEL: global_extload_v2bf16_to_v2f64:
-; GCN:       ; %bb.0:
+define void @test_call(bfloat %in, ptr addrspace(5) %out) {
+; GCN-LABEL: test_call:
+; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b32 s8, s33
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_writelane_b32 v2, s30, 0
+; GCN-NEXT:    v_writelane_b32 v2, s31, 1
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store at gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store at gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GCN-NEXT:    v_readlane_b32 s31, v2, 1
+; GCN-NEXT:    v_readlane_b32 s30, v2, 0
+; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_addk_i32 s32, 0xfc00
+; GCN-NEXT:    s_mov_b32 s33, s8
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_extload_v2bf16_to_v2f64:
-; GFX7:       ; %bb.0:
+; GFX7-LABEL: test_call:
+; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s8, s33
+; GFX7-NEXT:    s_mov_b32 s33, s32
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_addk_i32 s32, 0x400
+; GFX7-NEXT:    s_getpc_b64 s[4:5]
+; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store at gotpcrel32@lo+4
+; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store at gotpcrel32@hi+12
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX7-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX7-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX7-NEXT:    s_mov_b32 s33, s8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_extload_v2bf16_to_v2f64:
-; GFX8:       ; %bb.0:
+; GFX8-LABEL: test_call:
+; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dword v2, v[0:1]
+; GFX8-NEXT:    s_mov_b32 s6, s33
+; GFX8-NEXT:    s_mov_b32 s33, s32
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_addk_i32 s32, 0x400
+; GFX8-NEXT:    s_getpc_b64 s[4:5]
+; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store at gotpcrel32@lo+4
+; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store at gotpcrel32@hi+12
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX8-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX8-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX8-NEXT:    s_mov_b32 s33, s6
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_extload_v2bf16_to_v2f64:
-; GFX9:       ; %bb.0:
+; GFX9-LABEL: test_call:
+; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v2, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, test_arg_store at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    buffer_store_short_d16_hi v0, v1, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_extload_v2bf16_to_v2f64:
-; GFX10:       ; %bb.0:
+; GFX10-LABEL: test_call:
+; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_mov_b32 s6, s33
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, test_arg_store at gotpcrel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store at gotpcrel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    buffer_store_short_d16_hi v0, v1, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_mov_b32 s33, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_extload_v2bf16_to_v2f64:
-; GFX11:       ; %bb.0:
+; GFX11-LABEL: test_call:
+; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_mov_b32 s2, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, test_arg_store at gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, test_arg_store at gotpcrel32@hi+12
+; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    scratch_store_d16_hi_b16 v1, v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <2 x bfloat>, ptr addrspace(1) %ptr
-  %fpext = fpext <2 x bfloat> %load to <2 x double>
-  ret <2 x double> %fpext
+entry:
+  %result = call bfloat @test_arg_store(bfloat %in)
+  store volatile bfloat %result, ptr addrspace(5) %out
+  ret void
 }
 
-define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) {
-; GCN-LABEL: global_extload_v3bf16_to_v3f64:
-; GCN:       ; %bb.0:
+define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
+; GCN-LABEL: test_call_v2bf16:
+; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b32 s8, s33
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_writelane_b32 v3, s30, 0
+; GCN-NEXT:    v_writelane_b32 v3, s31, 1
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, 2, v2
+; GCN-NEXT:    buffer_store_short v1, v4, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GCN-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_readlane_b32 s31, v3, 1
+; GCN-NEXT:    v_readlane_b32 s30, v3, 0
+; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_addk_i32 s32, 0xfc00
+; GCN-NEXT:    s_mov_b32 s33, s8
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_extload_v3bf16_to_v3f64:
-; GFX7:       ; %bb.0:
+; GFX7-LABEL: test_call_v2bf16:
+; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s8, s33
+; GFX7-NEXT:    s_mov_b32 s33, s32
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_addk_i32 s32, 0x400
+; GFX7-NEXT:    s_getpc_b64 s[4:5]
+; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX7-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX7-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 2, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    buffer_store_short v1, v4, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readlane_b32 s31, v3, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX7-NEXT:    s_mov_b32 s33, s8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_extload_v3bf16_to_v3f64:
-; GFX8:       ; %bb.0:
+; GFX8-LABEL: test_call_v2bf16:
+; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[1:2], v[0:1]
+; GFX8-NEXT:    s_mov_b32 s6, s33
+; GFX8-NEXT:    s_mov_b32 s33, s32
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_addk_i32 s32, 0x400
+; GFX8-NEXT:    s_getpc_b64 s[4:5]
+; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX8-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX8-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX8-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX8-NEXT:    s_mov_b32 s33, s6
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_extload_v3bf16_to_v3f64:
-; GFX9:       ; %bb.0:
+; GFX9-LABEL: test_call_v2bf16:
+; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_extload_v3bf16_to_v3f64:
-; GFX10:       ; %bb.0:
+; GFX10-LABEL: test_call_v2bf16:
+; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    s_mov_b32 s6, s33
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_mov_b32 s33, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_extload_v3bf16_to_v3f64:
-; GFX11:       ; %bb.0:
+; GFX11-LABEL: test_call_v2bf16:
+; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_mov_b32 s2, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GFX11-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    scratch_store_b32 v1, v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <3 x bfloat>, ptr addrspace(1) %ptr
-  %fpext = fpext <3 x bfloat> %load to <3 x double>
-  ret <3 x double> %fpext
+entry:
+  %result = call <2 x bfloat> @test_arg_store_v2bf16(<2 x bfloat> %in)
+  store volatile <2 x bfloat> %result, ptr addrspace(5) %out
+  ret void
 }
 
-define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) {
-; GCN-LABEL: global_extload_v4bf16_to_v4f64:
-; GCN:       ; %bb.0:
+define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
+; GCN-LABEL: test_call_v3bf16:
+; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GCN-NEXT:    s_mov_b32 s8, s33
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_writelane_b32 v4, s30, 0
+; GCN-NEXT:    v_writelane_b32 v4, s31, 1
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, 4, v3
+; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT:    buffer_store_short v2, v5, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v0, v3, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_readlane_b32 s31, v4, 1
+; GCN-NEXT:    v_readlane_b32 s30, v4, 0
+; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_addk_i32 s32, 0xfc00
+; GCN-NEXT:    s_mov_b32 s33, s8
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_extload_v4bf16_to_v4f64:
-; GFX7:       ; %bb.0:
+; GFX7-LABEL: test_call_v3bf16:
+; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s8, s33
+; GFX7-NEXT:    s_mov_b32 s33, s32
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_addk_i32 s32, 0x400
+; GFX7-NEXT:    s_getpc_b64 s[4:5]
+; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX7-NEXT:    v_writelane_b32 v4, s30, 0
+; GFX7-NEXT:    v_writelane_b32 v4, s31, 1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 4, v3
+; GFX7-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v0, v3, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readlane_b32 s31, v4, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v4, 0
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX7-NEXT:    s_mov_b32 s33, s8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_extload_v4bf16_to_v4f64:
-; GFX8:       ; %bb.0:
+; GFX8-LABEL: test_call_v3bf16:
+; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_mov_b32 s6, s33
+; GFX8-NEXT:    s_mov_b32 s33, s32
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_addk_i32 s32, 0x400
+; GFX8-NEXT:    s_getpc_b64 s[4:5]
+; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 4, v2
+; GFX8-NEXT:    buffer_store_short v1, v4, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readlane_b32 s31, v3, 1
+; GFX8-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX8-NEXT:    s_mov_b32 s33, s6
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_extload_v4bf16_to_v4f64:
-; GFX9:       ; %bb.0:
+; GFX9-LABEL: test_call_v3bf16:
+; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readlane_b32 s31, v3, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v4
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_extload_v4bf16_to_v4f64:
-; GFX10:       ; %bb.0:
+; GFX10-LABEL: test_call_v3bf16:
+; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
+; GFX10-NEXT:    s_mov_b32 s6, s33
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_readlane_b32 s31, v3, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_mov_b32 s33, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_extload_v4bf16_to_v4f64:
-; GFX11:       ; %bb.0:
+; GFX11-LABEL: test_call_v3bf16:
+; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
+; GFX11-NEXT:    s_mov_b32 s2, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v3, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GFX11-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    scratch_store_b16 v2, v1, off offset:4 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_store_b32 v2, v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_readlane_b32 s31, v3, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <4 x bfloat>, ptr addrspace(1) %ptr
-  %fpext = fpext <4 x bfloat> %load to <4 x double>
-  ret <4 x double> %fpext
+entry:
+  %result = call <3 x bfloat> @test_arg_store_v2bf16(<3 x bfloat> %in)
+  store volatile <3 x bfloat> %result, ptr addrspace(5) %out
+  ret void
 }
 
-define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) {
-; GCN-LABEL: global_extload_v5bf16_to_v5f64:
-; GCN:       ; %bb.0:
+define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
+; GCN-LABEL: test_call_v4bf16:
+; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8
-; GCN-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT:    s_mov_b32 s8, s33
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_writelane_b32 v5, s30, 0
+; GCN-NEXT:    v_writelane_b32 v5, s31, 1
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, 6, v4
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, 4, v4
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, 2, v4
+; GCN-NEXT:    buffer_store_short v3, v6, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[8:9], v2
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[2:3], v4
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; GCN-NEXT:    buffer_store_short v2, v7, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v1, v8, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v0, v4, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_readlane_b32 s31, v5, 1
+; GCN-NEXT:    v_readlane_b32 s30, v5, 0
+; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_addk_i32 s32, 0xfc00
+; GCN-NEXT:    s_mov_b32 s33, s8
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_extload_v5bf16_to_v5f64:
-; GFX7:       ; %bb.0:
+; GFX7-LABEL: test_call_v4bf16:
+; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    s_mov_b32 s8, s33
+; GFX7-NEXT:    s_mov_b32 s33, s32
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_addk_i32 s32, 0x400
+; GFX7-NEXT:    s_getpc_b64 s[4:5]
+; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX7-NEXT:    v_writelane_b32 v5, s30, 0
+; GFX7-NEXT:    v_writelane_b32 v5, s31, 1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_add_i32_e32 v6, vcc, 6, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    buffer_store_short v3, v6, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 4, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    buffer_store_short v2, v3, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 2, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_short v0, v4, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readlane_b32 s31, v5, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v5, 0
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX7-NEXT:    s_mov_b32 s33, s8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[8:9], v2
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[2:3], v4
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_extload_v5bf16_to_v5f64:
-; GFX8:       ; %bb.0:
+; GFX8-LABEL: test_call_v4bf16:
+; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_ushort v8, v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX8-NEXT:    s_mov_b32 s6, s33
+; GFX8-NEXT:    s_mov_b32 s33, s32
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_addk_i32 s32, 0x400
+; GFX8-NEXT:    s_getpc_b64 s[4:5]
+; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX8-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 4, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX8-NEXT:    buffer_store_short v1, v6, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 6, v2
+; GFX8-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 2, v2
+; GFX8-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readlane_b32 s31, v3, 1
+; GFX8-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX8-NEXT:    s_mov_b32 s33, s6
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_extload_v5bf16_to_v5f64:
-; GFX9:       ; %bb.0:
+; GFX9-LABEL: test_call_v4bf16:
+; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:8
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v2
+; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    buffer_store_short_d16_hi v1, v2, s[0:3], 0 offen offset:6
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short_d16_hi v0, v2, s[0:3], 0 offen offset:2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readlane_b32 s31, v3, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v4
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_extload_v5bf16_to_v5f64:
-; GFX10:       ; %bb.0:
+; GFX10-LABEL: test_call_v4bf16:
+; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
-; GFX10-NEXT:    global_load_short_d16_hi v4, v[0:1], off offset:8
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    s_mov_b32 s6, s33
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    buffer_store_short_d16_hi v1, v2, s[0:3], 0 offen offset:6
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_short_d16_hi v0, v2, s[0:3], 0 offen offset:2
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_readlane_b32 s31, v3, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_mov_b32 s33, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[8:9], v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_extload_v5bf16_to_v5f64:
-; GFX11:       ; %bb.0:
+; GFX11-LABEL: test_call_v4bf16:
+; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b64 v[2:3], v[0:1], off
-; GFX11-NEXT:    global_load_d16_hi_b16 v4, v[0:1], off offset:8
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    s_mov_b32 s2, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v3, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GFX11-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 6, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_readlane_b32 s31, v3, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX11-NEXT:    scratch_store_d16_hi_b16 v4, v1, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_store_b16 v2, v1, off offset:4 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_store_d16_hi_b16 v2, v0, off offset:2 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_store_b16 v2, v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[8:9], v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <5 x bfloat>, ptr addrspace(1) %ptr
-  %fpext = fpext <5 x bfloat> %load to <5 x double>
-  ret <5 x double> %fpext
+entry:
+  %result = call <4 x bfloat> @test_arg_store_v2bf16(<4 x bfloat> %in)
+  store volatile <4 x bfloat> %result, ptr addrspace(5) %out
+  ret void
 }
 
-define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) {
-; GCN-LABEL: global_extload_v6bf16_to_v6f64:
-; GCN:       ; %bb.0:
+define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
+; GCN-LABEL: test_call_v8bf16:
+; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b32 s8, s33
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_writelane_b32 v9, s30, 0
+; GCN-NEXT:    v_writelane_b32 v9, s31, 1
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT:    v_add_i32_e32 v10, vcc, 14, v8
+; GCN-NEXT:    v_add_i32_e32 v11, vcc, 12, v8
+; GCN-NEXT:    v_add_i32_e32 v12, vcc, 10, v8
+; GCN-NEXT:    v_add_i32_e32 v13, vcc, 8, v8
+; GCN-NEXT:    v_add_i32_e32 v14, vcc, 6, v8
+; GCN-NEXT:    v_add_i32_e32 v15, vcc, 4, v8
+; GCN-NEXT:    v_add_i32_e32 v16, vcc, 2, v8
+; GCN-NEXT:    buffer_store_short v7, v10, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
-; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[2:3], v4
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
+; GCN-NEXT:    buffer_store_short v6, v11, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v5, v12, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v4, v13, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v3, v14, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v2, v15, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v1, v16, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v0, v8, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_readlane_b32 s31, v9, 1
+; GCN-NEXT:    v_readlane_b32 s30, v9, 0
+; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_addk_i32 s32, 0xfc00
+; GCN-NEXT:    s_mov_b32 s33, s8
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_extload_v6bf16_to_v6f64:
-; GFX7:       ; %bb.0:
+; GFX7-LABEL: test_call_v8bf16:
+; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s8, s33
+; GFX7-NEXT:    s_mov_b32 s33, s32
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_addk_i32 s32, 0x400
+; GFX7-NEXT:    s_getpc_b64 s[4:5]
+; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX7-NEXT:    v_writelane_b32 v9, s30, 0
+; GFX7-NEXT:    v_writelane_b32 v9, s31, 1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT:    v_add_i32_e32 v10, vcc, 14, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT:    buffer_store_short v7, v10, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 12, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    buffer_store_short v6, v7, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v6, vcc, 10, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    buffer_store_short v5, v6, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 8, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    buffer_store_short v4, v5, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 6, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    buffer_store_short v3, v4, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 4, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    buffer_store_short v2, v3, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 2, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_short v0, v8, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readlane_b32 s31, v9, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v9, 0
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX7-NEXT:    s_mov_b32 s33, s8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
-; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[2:3], v4
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_extload_v6bf16_to_v6f64:
-; GFX8:       ; %bb.0:
+; GFX8-LABEL: test_call_v8bf16:
+; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx3 v[0:2], v[0:1]
+; GFX8-NEXT:    s_mov_b32 s6, s33
+; GFX8-NEXT:    s_mov_b32 s33, s32
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_addk_i32 s32, 0x400
+; GFX8-NEXT:    s_getpc_b64 s[4:5]
+; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT:    v_writelane_b32 v5, s30, 0
+; GFX8-NEXT:    v_writelane_b32 v5, s31, 1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 12, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX8-NEXT:    buffer_store_short v3, v10, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[2:3], v4
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: global_extload_v6bf16_to_v6f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v4
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[10:11], v5
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v8
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v9
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 8, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX8-NEXT:    buffer_store_short v2, v3, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX8-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_store_short v0, v4, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 14, v4
+; GFX8-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 10, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX8-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 6, v4
+; GFX8-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 2, v4
+; GFX8-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readlane_b32 s31, v5, 1
+; GFX8-NEXT:    v_readlane_b32 s30, v5, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX8-NEXT:    s_mov_b32 s33, s6
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_call_v8bf16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v5, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v5, s31, 1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    buffer_store_short_d16_hi v3, v4, s[0:3], 0 offen offset:14
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v3, v4, s[0:3], 0 offen offset:12
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short_d16_hi v2, v4, s[0:3], 0 offen offset:10
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v2, v4, s[0:3], 0 offen offset:8
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short_d16_hi v1, v4, s[0:3], 0 offen offset:6
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v1, v4, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short_d16_hi v0, v4, s[0:3], 0 offen offset:2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v0, v4, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readlane_b32 s31, v5, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v5, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_extload_v6bf16_to_v6f64:
-; GFX10:       ; %bb.0:
+; GFX10-LABEL: test_call_v8bf16:
+; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx3 v[4:6], v[0:1], off
+; GFX10-NEXT:    s_mov_b32 s6, s33
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v5, s30, 0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_writelane_b32 v5, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    buffer_store_short_d16_hi v3, v4, s[0:3], 0 offen offset:14
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_short v3, v4, s[0:3], 0 offen offset:12
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_short_d16_hi v2, v4, s[0:3], 0 offen offset:10
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_short v2, v4, s[0:3], 0 offen offset:8
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_short_d16_hi v1, v4, s[0:3], 0 offen offset:6
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_short v1, v4, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_short_d16_hi v0, v4, s[0:3], 0 offen offset:2
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_short v0, v4, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_readlane_b32 s31, v5, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v5, 0
+; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_mov_b32 s33, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v6
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_extload_v6bf16_to_v6f64:
-; GFX11:       ; %bb.0:
+; GFX11-LABEL: test_call_v8bf16:
+; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b96 v[4:6], v[0:1], off
+; GFX11-NEXT:    s_mov_b32 s2, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v5, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GFX11-NEXT:    v_writelane_b32 v5, s30, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_writelane_b32 v5, s31, 1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 14, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 12, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 10, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 6, v4
+; GFX11-NEXT:    v_readlane_b32 s31, v5, 1
+; GFX11-NEXT:    scratch_store_d16_hi_b16 v6, v3, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_store_b16 v7, v3, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_store_d16_hi_b16 v8, v2, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_store_b16 v4, v2, off offset:8 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_store_d16_hi_b16 v9, v1, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_store_b16 v4, v1, off offset:4 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_store_d16_hi_b16 v4, v0, off offset:2 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_store_b16 v4, v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_readlane_b32 s30, v5, 0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v6
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <6 x bfloat>, ptr addrspace(1) %ptr
-  %fpext = fpext <6 x bfloat> %load to <6 x double>
-  ret <6 x double> %fpext
+entry:
+  %result = call <8 x bfloat> @test_arg_store_v2bf16(<8 x bfloat> %in)
+  store volatile <8 x bfloat> %result, ptr addrspace(5) %out
+  ret void
 }
 
-define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) {
-; GCN-LABEL: global_extload_v8bf16_to_v8f64:
-; GCN:       ; %bb.0:
+define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
+; GCN-LABEL: test_call_v16bf16:
+; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
-; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
-; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v3
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[2:3], v5
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: global_extload_v8bf16_to_v8f64:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
-; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
-; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v3
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[2:3], v5
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: global_extload_v8bf16_to_v8f64:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v3
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[2:3], v5
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: global_extload_v8bf16_to_v8f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v3
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v4
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v5
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[10:11], v8
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[14:15], v9
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v12
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v13
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[12:13], v16
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: global_extload_v8bf16_to_v8f64:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
-; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v10
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[10:11], v11
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_extload_v8bf16_to_v8f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[7:10], v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v10
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[10:11], v11
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <8 x bfloat>, ptr addrspace(1) %ptr
-  %fpext = fpext <8 x bfloat> %load to <8 x double>
-  ret <8 x double> %fpext
-}
-
-define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) {
-; GCN-LABEL: global_extload_v16bf16_to_v16f64:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
-; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
-; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
-; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
-; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
-; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v5
+; GCN-NEXT:    s_mov_b32 s8, s33
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_writelane_b32 v17, s30, 0
+; GCN-NEXT:    v_writelane_b32 v17, s31, 1
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT:    v_add_i32_e32 v18, vcc, 30, v16
+; GCN-NEXT:    v_add_i32_e32 v19, vcc, 28, v16
+; GCN-NEXT:    v_add_i32_e32 v20, vcc, 26, v16
+; GCN-NEXT:    v_add_i32_e32 v21, vcc, 24, v16
+; GCN-NEXT:    v_add_i32_e32 v22, vcc, 22, v16
+; GCN-NEXT:    v_add_i32_e32 v23, vcc, 20, v16
+; GCN-NEXT:    v_add_i32_e32 v24, vcc, 18, v16
+; GCN-NEXT:    v_add_i32_e32 v25, vcc, 16, v16
+; GCN-NEXT:    v_add_i32_e32 v26, vcc, 14, v16
+; GCN-NEXT:    v_add_i32_e32 v27, vcc, 12, v16
+; GCN-NEXT:    v_add_i32_e32 v28, vcc, 10, v16
+; GCN-NEXT:    buffer_store_short v15, v18, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    v_add_i32_e32 v15, vcc, 8, v16
+; GCN-NEXT:    v_add_i32_e32 v18, vcc, 6, v16
+; GCN-NEXT:    buffer_store_short v14, v19, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    v_add_i32_e32 v14, vcc, 4, v16
+; GCN-NEXT:    v_add_i32_e32 v19, vcc, 2, v16
+; GCN-NEXT:    buffer_store_short v13, v20, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v6
-; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
-; GCN-NEXT:    v_lshlrev_b32_e32 v20, 16, v7
-; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v7
-; GCN-NEXT:    v_lshlrev_b32_e32 v24, 16, v8
-; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
-; GCN-NEXT:    v_lshlrev_b32_e32 v28, 16, v9
-; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v9
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v10
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[6:7], v11
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[8:9], v12
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[10:11], v13
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[12:13], v14
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[14:15], v15
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[18:19], v18
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[20:21], v20
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[22:23], v22
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[24:25], v24
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[26:27], v26
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[28:29], v28
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[30:31], v30
+; GCN-NEXT:    buffer_store_short v12, v21, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v11, v22, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v10, v23, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v9, v24, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v8, v25, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v7, v26, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v6, v27, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v5, v28, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v4, v15, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v3, v18, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v2, v14, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v1, v19, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v0, v16, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_readlane_b32 s31, v17, 1
+; GCN-NEXT:    v_readlane_b32 s30, v17, 0
+; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_addk_i32 s32, 0xfc00
+; GCN-NEXT:    s_mov_b32 s33, s8
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_extload_v16bf16_to_v16f64:
-; GFX7:       ; %bb.0:
+; GFX7-LABEL: test_call_v16bf16:
+; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
-; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v5
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 16, v6
-; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v20, 16, v7
-; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v24, 16, v8
-; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v28, 16, v9
-; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v9
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[4:5], v10
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[6:7], v11
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[8:9], v12
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[10:11], v13
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[12:13], v14
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[14:15], v15
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[18:19], v18
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v20
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[22:23], v22
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[24:25], v24
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[26:27], v26
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[28:29], v28
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[30:31], v30
+; GFX7-NEXT:    s_mov_b32 s8, s33
+; GFX7-NEXT:    s_mov_b32 s33, s32
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_addk_i32 s32, 0x400
+; GFX7-NEXT:    s_getpc_b64 s[4:5]
+; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX7-NEXT:    v_writelane_b32 v17, s30, 0
+; GFX7-NEXT:    v_writelane_b32 v17, s31, 1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT:    v_add_i32_e32 v18, vcc, 30, v16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT:    buffer_store_short v15, v18, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v15, vcc, 28, v16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT:    buffer_store_short v14, v15, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v14, vcc, 26, v16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT:    buffer_store_short v13, v14, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v13, vcc, 24, v16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT:    buffer_store_short v12, v13, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v12, vcc, 22, v16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT:    buffer_store_short v11, v12, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v11, vcc, 20, v16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT:    buffer_store_short v10, v11, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v10, vcc, 18, v16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT:    buffer_store_short v9, v10, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v9, vcc, 16, v16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT:    buffer_store_short v8, v9, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 14, v16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT:    buffer_store_short v7, v8, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 12, v16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    buffer_store_short v6, v7, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v6, vcc, 10, v16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    buffer_store_short v5, v6, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 8, v16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    buffer_store_short v4, v5, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 6, v16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    buffer_store_short v3, v4, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 4, v16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    buffer_store_short v2, v3, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 2, v16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_short v0, v16, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readlane_b32 s31, v17, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v17, 0
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX7-NEXT:    s_mov_b32 s33, s8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_extload_v16bf16_to_v16f64:
-; GFX8:       ; %bb.0:
+; GFX8-LABEL: test_call_v16bf16:
+; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[2:5], v[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dwordx4 v[6:9], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
-; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
-; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
-; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v5
+; GFX8-NEXT:    s_mov_b32 s6, s33
+; GFX8-NEXT:    s_mov_b32 s33, s32
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_addk_i32 s32, 0x400
+; GFX8-NEXT:    s_getpc_b64 s[4:5]
+; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT:    v_writelane_b32 v9, s30, 0
+; GFX8-NEXT:    v_writelane_b32 v9, s31, 1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, 28, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
+; GFX8-NEXT:    buffer_store_short v7, v18, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 24, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
+; GFX8-NEXT:    buffer_store_short v6, v7, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 20, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v5
+; GFX8-NEXT:    buffer_store_short v5, v6, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 16, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
+; GFX8-NEXT:    buffer_store_short v4, v5, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 12, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
+; GFX8-NEXT:    buffer_store_short v3, v4, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 8, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
+; GFX8-NEXT:    buffer_store_short v2, v3, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
+; GFX8-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_store_short v0, v8, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 30, v8
+; GFX8-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 26, v8
+; GFX8-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 22, v8
+; GFX8-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 18, v8
+; GFX8-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 14, v8
+; GFX8-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 10, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
+; GFX8-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 6, v8
+; GFX8-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 2, v8
+; GFX8-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readlane_b32 s31, v9, 1
+; GFX8-NEXT:    v_readlane_b32 s30, v9, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX8-NEXT:    s_mov_b32 s33, s6
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v6
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v7
-; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v8
-; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
-; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v9
-; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v9
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[4:5], v10
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[6:7], v11
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[8:9], v12
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[10:11], v13
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[12:13], v14
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[14:15], v15
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[18:19], v18
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[20:21], v20
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[22:23], v22
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[24:25], v24
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[26:27], v26
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[28:29], v28
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[30:31], v30
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_extload_v16bf16_to_v16f64:
-; GFX9:       ; %bb.0:
+; GFX9-LABEL: test_call_v16bf16:
+; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:16
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
-; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v5
+; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v9, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v9, s31, 1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    buffer_store_short_d16_hi v7, v8, s[0:3], 0 offen offset:30
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v7, v8, s[0:3], 0 offen offset:28
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short_d16_hi v6, v8, s[0:3], 0 offen offset:26
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v6, v8, s[0:3], 0 offen offset:24
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short_d16_hi v5, v8, s[0:3], 0 offen offset:22
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v5, v8, s[0:3], 0 offen offset:20
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short_d16_hi v4, v8, s[0:3], 0 offen offset:18
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v4, v8, s[0:3], 0 offen offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short_d16_hi v3, v8, s[0:3], 0 offen offset:14
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v3, v8, s[0:3], 0 offen offset:12
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short_d16_hi v2, v8, s[0:3], 0 offen offset:10
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v2, v8, s[0:3], 0 offen offset:8
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short_d16_hi v1, v8, s[0:3], 0 offen offset:6
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v1, v8, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short_d16_hi v0, v8, s[0:3], 0 offen offset:2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v0, v8, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readlane_b32 s31, v9, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v9, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v6
-; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v7
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v8
-; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v9
-; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
-; GFX9-NEXT:    v_lshlrev_b32_e32 v34, 16, v9
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v0
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v1
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[14:15], v12
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[18:19], v13
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[22:23], v16
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[26:27], v17
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[30:31], v20
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v21
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v24
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v25
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[12:13], v28
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[16:17], v29
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[20:21], v32
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[24:25], v33
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[28:29], v34
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_extload_v16bf16_to_v16f64:
-; GFX10:       ; %bb.0:
+; GFX10-LABEL: test_call_v16bf16:
+; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx4 v[9:12], v[0:1], off offset:16
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
-; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v5
+; GFX10-NEXT:    s_mov_b32 s6, s33
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v9, s30, 0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_writelane_b32 v9, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    buffer_store_short_d16_hi v7, v8, s[0:3], 0 offen offset:30
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_short v7, v8, s[0:3], 0 offen offset:28
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_short_d16_hi v6, v8, s[0:3], 0 offen offset:26
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_short v6, v8, s[0:3], 0 offen offset:24
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_short_d16_hi v5, v8, s[0:3], 0 offen offset:22
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_short v5, v8, s[0:3], 0 offen offset:20
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_short_d16_hi v4, v8, s[0:3], 0 offen offset:18
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_short v4, v8, s[0:3], 0 offen offset:16
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_short_d16_hi v3, v8, s[0:3], 0 offen offset:14
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_short v3, v8, s[0:3], 0 offen offset:12
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_short_d16_hi v2, v8, s[0:3], 0 offen offset:10
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_short v2, v8, s[0:3], 0 offen offset:8
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_short_d16_hi v1, v8, s[0:3], 0 offen offset:6
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_short v1, v8, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_short_d16_hi v0, v8, s[0:3], 0 offen offset:2
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_short v0, v8, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_readlane_b32 s31, v9, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v9, 0
+; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_mov_b32 s33, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v10
-; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v11
-; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v12
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v6
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[4:5], v8
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[8:9], v15
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[10:11], v13
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[12:13], v18
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[18:19], v16
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[22:23], v17
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[16:17], v24
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[26:27], v20
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[30:31], v21
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[20:21], v25
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[24:25], v28
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[28:29], v29
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_extload_v16bf16_to_v16f64:
-; GFX11:       ; %bb.0:
+; GFX11-LABEL: test_call_v16bf16:
+; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b128 v[7:10], v[0:1], off
-; GFX11-NEXT:    global_load_b128 v[23:26], v[0:1], off offset:16
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v10
+; GFX11-NEXT:    s_mov_b32 s2, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v9, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, test_arg_store_v2bf16 at gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16 at gotpcrel32@hi+12
+; GFX11-NEXT:    v_writelane_b32 v9, s30, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_writelane_b32 v9, s31, 1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 30, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 28, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 20, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v13, 18, v8
+; GFX11-NEXT:    v_readlane_b32 s31, v9, 1
+; GFX11-NEXT:    scratch_store_d16_hi_b16 v10, v7, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_store_b16 v11, v7, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 26, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 24, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 22, v8
+; GFX11-NEXT:    v_readlane_b32 s30, v9, 0
+; GFX11-NEXT:    scratch_store_d16_hi_b16 v7, v6, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_store_b16 v10, v6, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_store_d16_hi_b16 v11, v5, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_store_b16 v12, v5, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_store_d16_hi_b16 v13, v4, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 14, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 12, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 10, v8
+; GFX11-NEXT:    scratch_store_b16 v8, v4, off offset:16 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 6, v8
+; GFX11-NEXT:    scratch_store_d16_hi_b16 v5, v3, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_store_b16 v6, v3, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_store_d16_hi_b16 v7, v2, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_store_b16 v8, v2, off offset:8 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_store_d16_hi_b16 v4, v1, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_store_b16 v8, v1, off offset:4 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_store_d16_hi_b16 v8, v0, off offset:2 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_store_b16 v8, v0, off dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v9, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v24
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v25
-; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v26
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xffff0000, v26
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[10:11], v11
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[18:19], v18
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[20:21], v20
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[22:23], v22
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[24:25], v24
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[26:27], v27
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[28:29], v28
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[30:31], v30
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <16 x bfloat>, ptr addrspace(1) %ptr
-  %fpext = fpext <16 x bfloat> %load to <16 x double>
-  ret <16 x double> %fpext
+entry:
+  %result = call <16 x bfloat> @test_arg_store_v2bf16(<16 x bfloat> %in)
+  store volatile <16 x bfloat> %result, ptr addrspace(5) %out
+  ret void
 }
 
-define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
-; GCN-LABEL: global_extload_v32bf16_to_v32f64:
+define bfloat @test_alloca_load_store_ret(bfloat %in) {
+; GCN-LABEL: test_alloca_load_store_ret:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    buffer_store_short v0, off, s[0:3], s32
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_alloca_load_store_ret:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], s32
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_alloca_load_store_ret:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    buffer_store_short v0, off, s[0:3], s32
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_alloca_load_store_ret:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_store_short_d16_hi v0, off, s[0:3], s32
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    buffer_load_short_d16_hi v0, off, s[0:3], s32 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_alloca_load_store_ret:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    buffer_store_short_d16_hi v0, off, s[0:3], s32
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_load_short_d16_hi v1, off, s[0:3], s32 glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_alloca_load_store_ret:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    scratch_store_d16_hi_b16 off, v0, s32 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_load_d16_hi_b16 v1, off, s32 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %in.addr = alloca bfloat, align 2, addrspace(5)
+  store volatile bfloat %in, ptr addrspace(5) %in.addr, align 2
+  %loaded = load volatile bfloat, ptr addrspace(5) %in.addr, align 2
+  ret bfloat %loaded
+}
+
+define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
+; GCN-LABEL: test_overflow_stack:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s6
-; GCN-NEXT:    buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64
-; GCN-NEXT:    buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:2
-; GCN-NEXT:    buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:4
-; GCN-NEXT:    buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:6
-; GCN-NEXT:    buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:8
-; GCN-NEXT:    buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:10
-; GCN-NEXT:    buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:12
-; GCN-NEXT:    buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:14
-; GCN-NEXT:    buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:16
-; GCN-NEXT:    buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:18
-; GCN-NEXT:    buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:20
-; GCN-NEXT:    buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:22
-; GCN-NEXT:    buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:24
-; GCN-NEXT:    buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:26
-; GCN-NEXT:    buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:28
-; GCN-NEXT:    buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:30
-; GCN-NEXT:    buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:48
-; GCN-NEXT:    buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:50
-; GCN-NEXT:    buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:52
-; GCN-NEXT:    buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:54
-; GCN-NEXT:    buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:56
-; GCN-NEXT:    buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:58
-; GCN-NEXT:    buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:60
-; GCN-NEXT:    buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:62
-; GCN-NEXT:    buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:32
-; GCN-NEXT:    buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:34
-; GCN-NEXT:    buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:36
-; GCN-NEXT:    buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:38
-; GCN-NEXT:    buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:40
-; GCN-NEXT:    buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:42
-; GCN-NEXT:    buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44
-; GCN-NEXT:    buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46
-; GCN-NEXT:    s_waitcnt vmcnt(8)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v32
-; GCN-NEXT:    v_add_i32_e32 v32, vcc, 0xfc, v0
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT:    buffer_store_dword v2, v32, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0xf8, v0
-; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v31
-; GCN-NEXT:    v_add_i32_e32 v31, vcc, 0xf4, v0
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8
+; GCN-NEXT:    v_add_i32_e32 v31, vcc, 0x7c, v0
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32
+; GCN-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0xf0, v0
-; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v31, vcc, 0xec, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
+; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    buffer_store_dword v32, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0x74, v0
+; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    buffer_store_dword v33, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v0
+; GCN-NEXT:    v_add_i32_e32 v31, vcc, 0x6c, v0
+; GCN-NEXT:    buffer_store_dword v30, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
 ; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v30
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v30, vcc, 0x64, v0
+; GCN-NEXT:    buffer_store_dword v29, v31, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0xe8, v0
-; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v30, vcc, 0xe4, v0
+; GCN-NEXT:    v_add_i32_e32 v29, vcc, 0x60, v0
+; GCN-NEXT:    v_add_i32_e32 v31, vcc, 0x5c, v0
+; GCN-NEXT:    buffer_store_dword v28, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0x58, v0
 ; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v29
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT:    buffer_store_dword v2, v30, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v28, vcc, 0x54, v0
+; GCN-NEXT:    buffer_store_dword v27, v30, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0xe0, v0
-; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v29, vcc, 0xdc, v0
-; GCN-NEXT:    v_add_i32_e32 v30, vcc, 0xd8, v0
+; GCN-NEXT:    v_add_i32_e32 v27, vcc, 0x50, v0
+; GCN-NEXT:    v_add_i32_e32 v30, vcc, 0x4c, v0
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    buffer_store_dword v26, v29, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v28
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT:    buffer_store_dword v2, v29, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v28, vcc, 0xd4, v0
-; GCN-NEXT:    buffer_store_dword v1, v30, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v29, vcc, 0xd0, v0
-; GCN-NEXT:    v_add_i32_e32 v30, vcc, 0xcc, v0
+; GCN-NEXT:    v_add_i32_e32 v26, vcc, 0x48, v0
+; GCN-NEXT:    v_add_i32_e32 v29, vcc, 0x44, v0
+; GCN-NEXT:    buffer_store_dword v25, v31, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v27
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT:    buffer_store_dword v2, v28, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v27, vcc, 0xc8, v0
-; GCN-NEXT:    buffer_store_dword v1, v29, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v28, vcc, 0xc4, v0
-; GCN-NEXT:    v_add_i32_e32 v29, vcc, 0xc0, v0
+; GCN-NEXT:    v_add_i32_e32 v25, vcc, 64, v0
+; GCN-NEXT:    v_add_i32_e32 v31, vcc, 60, v0
+; GCN-NEXT:    buffer_store_dword v24, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 56, v0
 ; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v26
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT:    buffer_store_dword v2, v30, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v26, vcc, 0xbc, v0
-; GCN-NEXT:    buffer_store_dword v1, v27, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v27, vcc, 0xb8, v0
-; GCN-NEXT:    v_add_i32_e32 v30, vcc, 0xb4, v0
+; GCN-NEXT:    v_add_i32_e32 v24, vcc, 52, v0
+; GCN-NEXT:    buffer_store_dword v23, v28, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v25
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT:    buffer_store_dword v2, v28, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v25, vcc, 0xb0, v0
-; GCN-NEXT:    buffer_store_dword v1, v29, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v28, vcc, 0xac, v0
-; GCN-NEXT:    v_add_i32_e32 v29, vcc, 0xa8, v0
-; GCN-NEXT:    s_waitcnt vmcnt(14) expcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v34
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT:    buffer_store_dword v2, v26, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v26, vcc, 0xa4, v0
-; GCN-NEXT:    buffer_store_dword v1, v27, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v27, vcc, 0xa0, v0
-; GCN-NEXT:    v_add_i32_e32 v31, vcc, 0x9c, v0
+; GCN-NEXT:    v_add_i32_e32 v23, vcc, 48, v0
+; GCN-NEXT:    v_add_i32_e32 v28, vcc, 44, v0
+; GCN-NEXT:    buffer_store_dword v22, v27, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v33
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT:    buffer_store_dword v2, v30, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v30, vcc, 0x98, v0
-; GCN-NEXT:    buffer_store_dword v1, v25, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v25, vcc, 0x94, v0
-; GCN-NEXT:    v_add_i32_e32 v32, vcc, 0x90, v0
+; GCN-NEXT:    v_add_i32_e32 v22, vcc, 40, v0
+; GCN-NEXT:    v_add_i32_e32 v27, vcc, 36, v0
+; GCN-NEXT:    buffer_store_dword v21, v30, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v24
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT:    buffer_store_dword v2, v28, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v24, vcc, 0x8c, v0
-; GCN-NEXT:    buffer_store_dword v1, v29, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v28, vcc, 0x88, v0
-; GCN-NEXT:    v_add_i32_e32 v29, vcc, 0x84, v0
+; GCN-NEXT:    v_add_i32_e32 v21, vcc, 32, v0
+; GCN-NEXT:    v_add_i32_e32 v30, vcc, 28, v0
+; GCN-NEXT:    buffer_store_dword v20, v26, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v23
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT:    buffer_store_dword v2, v26, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v23, vcc, 0x80, v0
-; GCN-NEXT:    buffer_store_dword v1, v27, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v26, vcc, 0x7c, v0
-; GCN-NEXT:    v_add_i32_e32 v27, vcc, 0x78, v0
+; GCN-NEXT:    v_add_i32_e32 v20, vcc, 24, v0
+; GCN-NEXT:    v_add_i32_e32 v26, vcc, 20, v0
+; GCN-NEXT:    buffer_store_dword v19, v29, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v22
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v22, vcc, 0x74, v0
-; GCN-NEXT:    buffer_store_dword v1, v30, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v30, vcc, 0x70, v0
-; GCN-NEXT:    v_add_i32_e32 v31, vcc, 0x6c, v0
+; GCN-NEXT:    v_add_i32_e32 v19, vcc, 16, v0
+; GCN-NEXT:    v_add_i32_e32 v29, vcc, 12, v0
+; GCN-NEXT:    buffer_store_dword v18, v25, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v21
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT:    buffer_store_dword v2, v25, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v21, vcc, 0x68, v0
-; GCN-NEXT:    buffer_store_dword v1, v32, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v25, vcc, 0x64, v0
-; GCN-NEXT:    v_add_i32_e32 v32, vcc, 0x60, v0
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v20
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT:    buffer_store_dword v2, v24, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v20, vcc, 0x5c, v0
-; GCN-NEXT:    buffer_store_dword v1, v28, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v24, vcc, 0x58, v0
-; GCN-NEXT:    v_add_i32_e32 v28, vcc, 0x54, v0
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v19
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT:    buffer_store_dword v2, v29, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v19, vcc, 0x50, v0
-; GCN-NEXT:    buffer_store_dword v1, v23, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v23, vcc, 0x4c, v0
-; GCN-NEXT:    v_add_i32_e32 v29, vcc, 0x48, v0
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v18
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT:    buffer_store_dword v2, v26, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v18, vcc, 0x44, v0
-; GCN-NEXT:    buffer_store_dword v1, v27, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v26, vcc, 64, v0
-; GCN-NEXT:    v_add_i32_e32 v27, vcc, 60, v0
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v17
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT:    buffer_store_dword v2, v22, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v17, vcc, 56, v0
-; GCN-NEXT:    buffer_store_dword v1, v30, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v22, vcc, 52, v0
-; GCN-NEXT:    v_add_i32_e32 v30, vcc, 48, v0
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v16
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v31, vcc, 44, v0
-; GCN-NEXT:    buffer_store_dword v1, v21, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v21, vcc, 40, v0
-; GCN-NEXT:    v_add_i32_e32 v33, vcc, 36, v0
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v15
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT:    buffer_store_dword v2, v25, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v25, vcc, 32, v0
-; GCN-NEXT:    buffer_store_dword v1, v32, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v32, vcc, 28, v0
-; GCN-NEXT:    v_add_i32_e32 v34, vcc, 24, v0
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v14
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT:    buffer_store_dword v2, v20, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v20, vcc, 20, v0
-; GCN-NEXT:    buffer_store_dword v1, v24, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v24, vcc, 16, v0
-; GCN-NEXT:    v_add_i32_e32 v35, vcc, 12, v0
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v13
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT:    buffer_store_dword v2, v28, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v28, vcc, 8, v0
-; GCN-NEXT:    buffer_store_dword v1, v19, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v19, vcc, 4, v0
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v12
-; GCN-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GCN-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GCN-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
-; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
-; GCN-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
-; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v7
-; GCN-NEXT:    v_lshlrev_b32_e32 v36, 16, v8
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[3:4], v11
-; GCN-NEXT:    buffer_store_dword v2, v23, s[0:3], 0 offen
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[5:6], v10
-; GCN-NEXT:    buffer_store_dword v1, v29, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v9
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[7:8], v12
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[9:10], v13
+; GCN-NEXT:    v_add_i32_e32 v18, vcc, 8, v0
+; GCN-NEXT:    v_add_i32_e32 v25, vcc, 4, v0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, 0x80, v0
+; GCN-NEXT:    buffer_store_dword v17, v31, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v16, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v15, v24, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v14, v23, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v13, v28, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v12, v22, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v11, v27, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v10, v21, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v9, v30, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v8, v20, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v7, v26, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v6, v19, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v5, v29, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_dword v4, v18, s[0:3], 0 offen
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[11:12], v36
-; GCN-NEXT:    buffer_store_dword v3, v26, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[3:4], v14
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[13:14], v15
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[15:16], v16
-; GCN-NEXT:    buffer_store_dword v6, v27, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v5, v17, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v2, v22, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v1, v30, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v12, v31, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v11, v21, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v16, v33, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v15, v25, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v14, v32, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v13, v34, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v4, v20, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v3, v24, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v10, v35, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v9, v28, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v8, v19, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v3, v25, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_extload_v32bf16_to_v32f64:
+; GFX7-LABEL: test_overflow_stack:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:62
-; GFX7-NEXT:    buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:60
-; GFX7-NEXT:    buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:58
-; GFX7-NEXT:    buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:56
-; GFX7-NEXT:    buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:54
-; GFX7-NEXT:    buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:52
-; GFX7-NEXT:    buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:50
-; GFX7-NEXT:    buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:48
-; GFX7-NEXT:    buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:32
-; GFX7-NEXT:    buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:34
-; GFX7-NEXT:    buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:36
-; GFX7-NEXT:    buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:38
-; GFX7-NEXT:    buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:40
-; GFX7-NEXT:    buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:42
-; GFX7-NEXT:    buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44
-; GFX7-NEXT:    buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46
-; GFX7-NEXT:    buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64
-; GFX7-NEXT:    buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:2
-; GFX7-NEXT:    buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:4
-; GFX7-NEXT:    buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:6
-; GFX7-NEXT:    buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:8
-; GFX7-NEXT:    buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:10
-; GFX7-NEXT:    buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:12
-; GFX7-NEXT:    buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:14
-; GFX7-NEXT:    buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT:    buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 offset:18
-; GFX7-NEXT:    buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:20
-; GFX7-NEXT:    buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:22
-; GFX7-NEXT:    buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:24
-; GFX7-NEXT:    buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:26
-; GFX7-NEXT:    buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:28
-; GFX7-NEXT:    buffer_load_ushort v1, v[1:2], s[4:7], 0 addr64 offset:30
-; GFX7-NEXT:    s_waitcnt vmcnt(14)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v20
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v2
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xfc, v0
-; GFX7-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xf8, v0
-; GFX7-NEXT:    buffer_store_dword v20, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v22
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v2
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xf4, v0
-; GFX7-NEXT:    v_add_i32_e32 v22, vcc, 0xd8, v0
-; GFX7-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xf0, v0
-; GFX7-NEXT:    buffer_store_dword v20, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v23
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v2
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xec, v0
-; GFX7-NEXT:    s_waitcnt vmcnt(14)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX7-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xe8, v0
-; GFX7-NEXT:    buffer_store_dword v20, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v24
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v2
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xe4, v0
-; GFX7-NEXT:    v_add_i32_e32 v24, vcc, 0xd0, v0
-; GFX7-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xe0, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v21, 16, v25
-; GFX7-NEXT:    buffer_store_dword v20, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v21
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xdc, v0
-; GFX7-NEXT:    s_waitcnt vmcnt(8)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_lshlrev_b32_e32 v21, 16, v26
-; GFX7-NEXT:    buffer_store_dword v20, v22, s[0:3], 0 offen
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v21
-; GFX7-NEXT:    v_lshlrev_b32_e32 v22, 16, v27
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xd4, v0
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[22:23], v22
-; GFX7-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
-; GFX7-NEXT:    buffer_store_dword v20, v24, s[0:3], 0 offen
-; GFX7-NEXT:    v_lshlrev_b32_e32 v20, 16, v28
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v20
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xcc, v0
+; GFX7-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8
+; GFX7-NEXT:    v_add_i32_e32 v31, vcc, 0x7c, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4
+; GFX7-NEXT:    v_add_i32_e32 v31, vcc, 0x78, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s32
+; GFX7-NEXT:    v_add_i32_e32 v31, vcc, 0x74, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v0
+; GFX7-NEXT:    buffer_store_dword v30, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x6c, v0
+; GFX7-NEXT:    buffer_store_dword v29, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
+; GFX7-NEXT:    buffer_store_dword v28, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
+; GFX7-NEXT:    buffer_store_dword v27, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x60, v0
+; GFX7-NEXT:    buffer_store_dword v26, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
+; GFX7-NEXT:    buffer_store_dword v25, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x58, v0
+; GFX7-NEXT:    buffer_store_dword v24, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
 ; GFX7-NEXT:    buffer_store_dword v23, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xc8, v0
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x50, v0
 ; GFX7-NEXT:    buffer_store_dword v22, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xc4, v0
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
 ; GFX7-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_lshlrev_b32_e32 v21, 16, v34
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[21:22], v21
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xc0, v0
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x48, v0
 ; GFX7-NEXT:    buffer_store_dword v20, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xbc, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v20, 16, v33
-; GFX7-NEXT:    buffer_store_dword v22, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[22:23], v20
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xb8, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v20, 16, v32
-; GFX7-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v20
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xb4, v0
-; GFX7-NEXT:    buffer_store_dword v23, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xb0, v0
-; GFX7-NEXT:    buffer_store_dword v22, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xac, v0
-; GFX7-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_lshlrev_b32_e32 v21, 16, v31
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[21:22], v21
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xa8, v0
-; GFX7-NEXT:    buffer_store_dword v20, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xa4, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v20, 16, v30
-; GFX7-NEXT:    buffer_store_dword v22, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[22:23], v20
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xa0, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v20, 16, v29
-; GFX7-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v20
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x9c, v0
-; GFX7-NEXT:    buffer_store_dword v23, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x98, v0
-; GFX7-NEXT:    buffer_store_dword v22, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x94, v0
-; GFX7-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x90, v0
-; GFX7-NEXT:    buffer_store_dword v20, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v18
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v2
-; GFX7-NEXT:    v_add_i32_e32 v18, vcc, 0x8c, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v19
-; GFX7-NEXT:    buffer_store_dword v21, v18, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v18, vcc, 0x88, v0
-; GFX7-NEXT:    buffer_store_dword v20, v18, s[0:3], 0 offen
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[18:19], v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v15
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v2
-; GFX7-NEXT:    v_add_i32_e32 v15, vcc, 0x84, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v17
-; GFX7-NEXT:    buffer_store_dword v21, v15, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v15, vcc, 0x80, v0
-; GFX7-NEXT:    buffer_store_dword v20, v15, s[0:3], 0 offen
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v2
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GFX7-NEXT:    v_add_i32_e32 v15, vcc, 0x7c, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX7-NEXT:    buffer_store_dword v2, v15, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
-; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[1:2], v14
-; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v16
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
-; GFX7-NEXT:    v_add_i32_e32 v16, vcc, 0x74, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX7-NEXT:    buffer_store_dword v15, v16, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v15, vcc, 0x70, v0
-; GFX7-NEXT:    buffer_store_dword v14, v15, s[0:3], 0 offen
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[14:15], v12
-; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
-; GFX7-NEXT:    v_add_i32_e32 v16, vcc, 0x6c, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX7-NEXT:    buffer_store_dword v13, v16, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v13, vcc, 0x68, v0
-; GFX7-NEXT:    buffer_store_dword v12, v13, s[0:3], 0 offen
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[12:13], v10
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
-; GFX7-NEXT:    v_add_i32_e32 v16, vcc, 0x64, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    buffer_store_dword v11, v16, s[0:3], 0 offen
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[16:17], v8
-; GFX7-NEXT:    v_add_i32_e32 v11, vcc, 0x60, v0
-; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 0x5c, v0
-; GFX7-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
-; GFX7-NEXT:    buffer_store_dword v17, v8, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 0x58, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GFX7-NEXT:    buffer_store_dword v16, v8, s[0:3], 0 offen
-; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 16, v5
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
-; GFX7-NEXT:    v_add_i32_e32 v6, vcc, 0x54, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX7-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 0x50, v0
-; GFX7-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[3:4], v3
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[5:6], v16
-; GFX7-NEXT:    v_add_i32_e32 v16, vcc, 0x4c, v0
-; GFX7-NEXT:    buffer_store_dword v4, v16, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x48, v0
-; GFX7-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[3:4], v11
-; GFX7-NEXT:    v_add_i32_e32 v11, vcc, 0x44, v0
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[7:8], v7
-; GFX7-NEXT:    buffer_store_dword v6, v11, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v6, vcc, 64, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX7-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 60, v0
-; GFX7-NEXT:    v_cvt_f64_f32_e32 v[9:10], v9
-; GFX7-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 56, v0
-; GFX7-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
-; GFX7-NEXT:    buffer_store_dword v8, v3, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
-; GFX7-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 44, v0
-; GFX7-NEXT:    buffer_store_dword v10, v3, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 40, v0
-; GFX7-NEXT:    buffer_store_dword v9, v3, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 36, v0
-; GFX7-NEXT:    buffer_store_dword v13, v3, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 32, v0
-; GFX7-NEXT:    buffer_store_dword v12, v3, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
-; GFX7-NEXT:    buffer_store_dword v15, v3, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 24, v0
-; GFX7-NEXT:    buffer_store_dword v14, v3, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
-; GFX7-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
+; GFX7-NEXT:    buffer_store_dword v19, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 64, v0
+; GFX7-NEXT:    buffer_store_dword v18, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
+; GFX7-NEXT:    buffer_store_dword v17, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 56, v0
+; GFX7-NEXT:    buffer_store_dword v16, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 52, v0
+; GFX7-NEXT:    buffer_store_dword v15, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 48, v0
+; GFX7-NEXT:    buffer_store_dword v14, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 44, v0
+; GFX7-NEXT:    buffer_store_dword v13, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 40, v0
+; GFX7-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 36, v0
+; GFX7-NEXT:    buffer_store_dword v11, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 32, v0
+; GFX7-NEXT:    buffer_store_dword v10, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 28, v0
+; GFX7-NEXT:    buffer_store_dword v9, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 24, v0
+; GFX7-NEXT:    buffer_store_dword v8, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 20, v0
+; GFX7-NEXT:    buffer_store_dword v7, v2, s[0:3], 0 offen
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 16, v0
-; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
-; GFX7-NEXT:    buffer_store_dword v21, v1, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
-; GFX7-NEXT:    buffer_store_dword v20, v1, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
-; GFX7-NEXT:    buffer_store_dword v19, v1, s[0:3], 0 offen
-; GFX7-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v6, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 12, v0
+; GFX7-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
+; GFX7-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x80, v0
+; GFX7-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_extload_v32bf16_to_v32f64:
+; GFX8-LABEL: test_overflow_stack:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 2, v1
-; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 4, v1
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 6, v1
-; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 8, v1
-; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_add_u32_e32 v11, vcc, 10, v1
-; GFX8-NEXT:    v_addc_u32_e32 v12, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_add_u32_e32 v13, vcc, 12, v1
-; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_add_u32_e32 v17, vcc, 14, v1
-; GFX8-NEXT:    v_addc_u32_e32 v18, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_add_u32_e32 v21, vcc, 16, v1
-; GFX8-NEXT:    v_addc_u32_e32 v22, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_add_u32_e32 v15, vcc, 18, v1
-; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_add_u32_e32 v19, vcc, 20, v1
-; GFX8-NEXT:    v_addc_u32_e32 v20, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_add_u32_e32 v23, vcc, 22, v1
-; GFX8-NEXT:    v_addc_u32_e32 v24, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_add_u32_e32 v25, vcc, 24, v1
-; GFX8-NEXT:    v_addc_u32_e32 v26, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_add_u32_e32 v27, vcc, 26, v1
-; GFX8-NEXT:    v_addc_u32_e32 v28, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_add_u32_e32 v29, vcc, 28, v1
-; GFX8-NEXT:    v_addc_u32_e32 v30, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_add_u32_e32 v31, vcc, 30, v1
-; GFX8-NEXT:    v_addc_u32_e32 v32, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_add_u32_e32 v33, vcc, 32, v1
-; GFX8-NEXT:    v_addc_u32_e32 v34, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_add_u32_e32 v35, vcc, 34, v1
-; GFX8-NEXT:    v_addc_u32_e32 v36, vcc, 0, v2, vcc
-; GFX8-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX8-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX8-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX8-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX8-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX8-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX8-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX8-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX8-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX8-NEXT:    buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX8-NEXT:    v_add_u32_e32 v37, vcc, 36, v1
-; GFX8-NEXT:    flat_load_ushort v43, v[1:2]
-; GFX8-NEXT:    v_addc_u32_e32 v38, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_add_u32_e32 v48, vcc, 38, v1
-; GFX8-NEXT:    v_addc_u32_e32 v49, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_add_u32_e32 v50, vcc, 62, v1
-; GFX8-NEXT:    v_addc_u32_e32 v51, vcc, 0, v2, vcc
-; GFX8-NEXT:    flat_load_ushort v44, v[50:51]
-; GFX8-NEXT:    v_add_u32_e32 v50, vcc, 60, v1
-; GFX8-NEXT:    v_addc_u32_e32 v51, vcc, 0, v2, vcc
-; GFX8-NEXT:    flat_load_ushort v45, v[50:51]
-; GFX8-NEXT:    v_add_u32_e32 v50, vcc, 40, v1
-; GFX8-NEXT:    v_addc_u32_e32 v51, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_add_u32_e32 v52, vcc, 58, v1
-; GFX8-NEXT:    v_addc_u32_e32 v53, vcc, 0, v2, vcc
-; GFX8-NEXT:    flat_load_ushort v46, v[52:53]
-; GFX8-NEXT:    v_add_u32_e32 v52, vcc, 42, v1
-; GFX8-NEXT:    v_addc_u32_e32 v53, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_add_u32_e32 v54, vcc, 56, v1
-; GFX8-NEXT:    v_addc_u32_e32 v55, vcc, 0, v2, vcc
-; GFX8-NEXT:    flat_load_ushort v47, v[54:55]
-; GFX8-NEXT:    v_add_u32_e32 v54, vcc, 44, v1
-; GFX8-NEXT:    v_addc_u32_e32 v55, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_add_u32_e32 v39, vcc, 54, v1
-; GFX8-NEXT:    v_addc_u32_e32 v40, vcc, 0, v2, vcc
-; GFX8-NEXT:    flat_load_ushort v56, v[39:40]
-; GFX8-NEXT:    v_add_u32_e32 v39, vcc, 52, v1
-; GFX8-NEXT:    v_addc_u32_e32 v40, vcc, 0, v2, vcc
-; GFX8-NEXT:    flat_load_ushort v57, v[39:40]
-; GFX8-NEXT:    v_add_u32_e32 v39, vcc, 46, v1
-; GFX8-NEXT:    v_addc_u32_e32 v40, vcc, 0, v2, vcc
-; GFX8-NEXT:    v_add_u32_e32 v41, vcc, 50, v1
-; GFX8-NEXT:    v_addc_u32_e32 v42, vcc, 0, v2, vcc
-; GFX8-NEXT:    flat_load_ushort v41, v[41:42]
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 48, v1
-; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GFX8-NEXT:    flat_load_ushort v42, v[9:10]
-; GFX8-NEXT:    flat_load_ushort v9, v[35:36]
-; GFX8-NEXT:    flat_load_ushort v10, v[37:38]
-; GFX8-NEXT:    flat_load_ushort v35, v[48:49]
-; GFX8-NEXT:    flat_load_ushort v36, v[50:51]
-; GFX8-NEXT:    flat_load_ushort v37, v[52:53]
-; GFX8-NEXT:    flat_load_ushort v48, v[54:55]
-; GFX8-NEXT:    flat_load_ushort v39, v[39:40]
-; GFX8-NEXT:    flat_load_ushort v49, v[1:2]
-; GFX8-NEXT:    flat_load_ushort v50, v[3:4]
-; GFX8-NEXT:    flat_load_ushort v51, v[5:6]
-; GFX8-NEXT:    flat_load_ushort v52, v[7:8]
-; GFX8-NEXT:    flat_load_ushort v53, v[11:12]
-; GFX8-NEXT:    flat_load_ushort v38, v[13:14]
-; GFX8-NEXT:    flat_load_ushort v14, v[17:18]
-; GFX8-NEXT:    flat_load_ushort v11, v[21:22]
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 4, v0
-; GFX8-NEXT:    flat_load_ushort v15, v[15:16]
-; GFX8-NEXT:    flat_load_ushort v13, v[19:20]
-; GFX8-NEXT:    flat_load_ushort v8, v[23:24]
-; GFX8-NEXT:    flat_load_ushort v6, v[25:26]
-; GFX8-NEXT:    flat_load_ushort v5, v[27:28]
-; GFX8-NEXT:    flat_load_ushort v7, v[29:30]
-; GFX8-NEXT:    flat_load_ushort v12, v[31:32]
-; GFX8-NEXT:    flat_load_ushort v16, v[33:34]
-; GFX8-NEXT:    v_add_u32_e32 v18, vcc, 0xc4, v0
-; GFX8-NEXT:    v_add_u32_e32 v20, vcc, 0xbc, v0
-; GFX8-NEXT:    v_add_u32_e32 v22, vcc, 0xb4, v0
-; GFX8-NEXT:    v_add_u32_e32 v24, vcc, 0xac, v0
-; GFX8-NEXT:    v_add_u32_e32 v26, vcc, 0xa4, v0
-; GFX8-NEXT:    v_add_u32_e32 v27, vcc, 0x9c, v0
-; GFX8-NEXT:    s_waitcnt vmcnt(14)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v43
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GFX8-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; GFX8-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0xfc, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v44
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; GFX8-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v45
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[3:4], v3
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0xf8, v0
-; GFX8-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0xf4, v0
-; GFX8-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v46
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xf0, v0
-; GFX8-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xec, v0
-; GFX8-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xe8, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v47
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; GFX8-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0xe4, v0
-; GFX8-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0xe0, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v56
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[3:4], v3
-; GFX8-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0xdc, v0
-; GFX8-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v57
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xd8, v0
-; GFX8-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xd4, v0
-; GFX8-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0xd0, v0
-; GFX8-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v41
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xcc, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v42
-; GFX8-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0xc8, v0
-; GFX8-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[1:2], v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v49
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[3:4], v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v50
-; GFX8-NEXT:    s_waitcnt vmcnt(14)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v51
-; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v52
-; GFX8-NEXT:    buffer_store_dword v4, v18, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xc0, v0
-; GFX8-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[3:4], v17
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v39
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[17:18], v17
-; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v53
-; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v38
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX8-NEXT:    buffer_store_dword v18, v20, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v18, vcc, 0xb8, v0
-; GFX8-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[17:18], v19
-; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v48
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[19:20], v19
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    buffer_store_dword v20, v22, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v20, vcc, 0xb0, v0
-; GFX8-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[19:20], v21
-; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v37
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[21:22], v21
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    buffer_store_dword v22, v24, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v22, vcc, 0xa8, v0
-; GFX8-NEXT:    buffer_store_dword v21, v22, s[0:3], 0 offen
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[21:22], v23
-; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v36
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[23:24], v23
-; GFX8-NEXT:    buffer_store_dword v24, v26, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v24, vcc, 0xa0, v0
-; GFX8-NEXT:    buffer_store_dword v23, v24, s[0:3], 0 offen
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[23:24], v25
-; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v35
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[25:26], v25
-; GFX8-NEXT:    buffer_store_dword v26, v27, s[0:3], 0 offen
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[27:28], v10
-; GFX8-NEXT:    v_add_u32_e32 v26, vcc, 0x98, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
-; GFX8-NEXT:    v_add_u32_e32 v11, vcc, 0x94, v0
-; GFX8-NEXT:    buffer_store_dword v25, v26, s[0:3], 0 offen
-; GFX8-NEXT:    buffer_store_dword v28, v11, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v11, vcc, 0x90, v0
-; GFX8-NEXT:    buffer_store_dword v27, v11, s[0:3], 0 offen
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[27:28], v9
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[25:26], v14
-; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 0x8c, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v15
-; GFX8-NEXT:    buffer_store_dword v28, v14, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 0x88, v0
-; GFX8-NEXT:    buffer_store_dword v27, v14, s[0:3], 0 offen
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[14:15], v9
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v16
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[27:28], v9
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v13
-; GFX8-NEXT:    v_add_u32_e32 v13, vcc, 0x84, v0
-; GFX8-NEXT:    buffer_store_dword v28, v13, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v13, vcc, 0x80, v0
-; GFX8-NEXT:    buffer_store_dword v27, v13, s[0:3], 0 offen
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[27:28], v9
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[12:13], v9
-; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7c, v0
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
-; GFX8-NEXT:    buffer_store_dword v13, v9, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x78, v0
-; GFX8-NEXT:    buffer_store_dword v12, v9, s[0:3], 0 offen
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
-; GFX8-NEXT:    v_add_u32_e32 v13, vcc, 0x74, v0
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
-; GFX8-NEXT:    buffer_store_dword v7, v13, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x70, v0
-; GFX8-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[5:6], v5
-; GFX8-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x6c, v0
-; GFX8-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x68, v0
-; GFX8-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x64, v0
-; GFX8-NEXT:    buffer_store_dword v13, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x60, v0
-; GFX8-NEXT:    buffer_store_dword v12, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x5c, v0
-; GFX8-NEXT:    buffer_store_dword v9, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x58, v0
-; GFX8-NEXT:    buffer_store_dword v8, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x54, v0
-; GFX8-NEXT:    buffer_store_dword v28, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x50, v0
-; GFX8-NEXT:    buffer_store_dword v27, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x4c, v0
-; GFX8-NEXT:    buffer_store_dword v15, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x48, v0
-; GFX8-NEXT:    buffer_store_dword v14, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x44, v0
-; GFX8-NEXT:    buffer_store_dword v11, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 64, v0
-; GFX8-NEXT:    buffer_store_dword v10, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 60, v0
-; GFX8-NEXT:    buffer_store_dword v26, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 56, v0
-; GFX8-NEXT:    buffer_store_dword v25, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 52, v0
-; GFX8-NEXT:    buffer_store_dword v24, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 48, v0
-; GFX8-NEXT:    buffer_store_dword v23, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 44, v0
-; GFX8-NEXT:    buffer_store_dword v22, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 40, v0
-; GFX8-NEXT:    buffer_store_dword v21, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 36, v0
-; GFX8-NEXT:    buffer_store_dword v20, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 32, v0
-; GFX8-NEXT:    buffer_store_dword v19, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 28, v0
-; GFX8-NEXT:    buffer_store_dword v18, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 24, v0
-; GFX8-NEXT:    buffer_store_dword v17, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 20, v0
-; GFX8-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
-; GFX8-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 12, v0
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v0
-; GFX8-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; GFX8-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX8-NEXT:    buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX8-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX8-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX8-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX8-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX8-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX8-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX8-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX8-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX8-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8
+; GFX8-NEXT:    v_add_u32_e32 v31, vcc, 0x7c, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4
+; GFX8-NEXT:    v_add_u32_e32 v31, vcc, 0x78, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s32
+; GFX8-NEXT:    v_add_u32_e32 v31, vcc, 0x74, v0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x70, v0
+; GFX8-NEXT:    buffer_store_dword v30, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x6c, v0
+; GFX8-NEXT:    buffer_store_dword v29, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x68, v0
+; GFX8-NEXT:    buffer_store_dword v28, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x64, v0
+; GFX8-NEXT:    buffer_store_dword v27, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x60, v0
+; GFX8-NEXT:    buffer_store_dword v26, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x5c, v0
+; GFX8-NEXT:    buffer_store_dword v25, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x58, v0
+; GFX8-NEXT:    buffer_store_dword v24, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x54, v0
+; GFX8-NEXT:    buffer_store_dword v23, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x50, v0
+; GFX8-NEXT:    buffer_store_dword v22, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x4c, v0
+; GFX8-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x48, v0
+; GFX8-NEXT:    buffer_store_dword v20, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x44, v0
+; GFX8-NEXT:    buffer_store_dword v19, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 64, v0
+; GFX8-NEXT:    buffer_store_dword v18, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 60, v0
+; GFX8-NEXT:    buffer_store_dword v17, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 56, v0
+; GFX8-NEXT:    buffer_store_dword v16, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 52, v0
+; GFX8-NEXT:    buffer_store_dword v15, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 48, v0
+; GFX8-NEXT:    buffer_store_dword v14, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 44, v0
+; GFX8-NEXT:    buffer_store_dword v13, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 40, v0
+; GFX8-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 36, v0
+; GFX8-NEXT:    buffer_store_dword v11, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 32, v0
+; GFX8-NEXT:    buffer_store_dword v10, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 28, v0
+; GFX8-NEXT:    buffer_store_dword v9, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 24, v0
+; GFX8-NEXT:    buffer_store_dword v8, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 20, v0
+; GFX8-NEXT:    buffer_store_dword v7, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 16, v0
+; GFX8-NEXT:    buffer_store_dword v6, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 12, v0
+; GFX8-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
+; GFX8-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x80, v0
+; GFX8-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_overflow_stack:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:112
+; GFX9-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:108
+; GFX9-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
+; GFX9-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
+; GFX9-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
+; GFX9-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
+; GFX9-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
+; GFX9-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
+; GFX9-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
+; GFX9-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
+; GFX9-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
+; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
+; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
+; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
+; GFX9-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
+; GFX9-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:48
+; GFX9-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:44
+; GFX9-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:40
+; GFX9-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:36
+; GFX9-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:32
+; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
+; GFX9-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
+; GFX9-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; GFX9-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; GFX9-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; GFX9-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; GFX9-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(18)
+; GFX9-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:124
+; GFX9-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:120
+; GFX9-NEXT:    s_waitcnt vmcnt(18)
+; GFX9-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:116
+; GFX9-NEXT:    buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen offset:128
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_overflow_stack:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
+; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32
+; GFX10-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:112
+; GFX10-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:108
+; GFX10-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
+; GFX10-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
+; GFX10-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
+; GFX10-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
+; GFX10-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
+; GFX10-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
+; GFX10-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
+; GFX10-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
+; GFX10-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
+; GFX10-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
+; GFX10-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
+; GFX10-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
+; GFX10-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
+; GFX10-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
+; GFX10-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:48
+; GFX10-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:44
+; GFX10-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:40
+; GFX10-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:36
+; GFX10-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:32
+; GFX10-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
+; GFX10-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
+; GFX10-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; GFX10-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; GFX10-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; GFX10-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; GFX10-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:124
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:120
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen offset:116
+; GFX10-NEXT:    buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen offset:128
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_overflow_stack:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_clause 0x4
+; GFX11-NEXT:    scratch_store_b128 off, v[18:21], s0 offset:64
+; GFX11-NEXT:    scratch_store_b128 off, v[10:13], s0 offset:32
+; GFX11-NEXT:    scratch_store_b128 off, v[6:9], s0 offset:16
+; GFX11-NEXT:    scratch_store_b128 off, v[2:5], s0
+; GFX11-NEXT:    scratch_store_d16_hi_b16 off, v1, s0 offset:128
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x70
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x60
+; GFX11-NEXT:    s_add_i32 s3, s0, 0x50
+; GFX11-NEXT:    s_add_i32 s0, s0, 48
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b128 off, v[30:33], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[26:29], s2
+; GFX11-NEXT:    scratch_store_b128 off, v[22:25], s3
+; GFX11-NEXT:    scratch_store_b128 off, v[14:17], s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0
+  %ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1
+  ret { <32 x i32>, bfloat } %ins.1
+}
+
+define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_extload_v2bf16_to_v2f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_extload_v2bf16_to_v2f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_extload_v2bf16_to_v2f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dword v1, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: global_extload_v2bf16_to_v2f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_extload_v2bf16_to_v2f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v1, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_extload_v2bf16_to_v2f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %load = load <2 x bfloat>, ptr addrspace(1) %ptr
+  %fpext = fpext <2 x bfloat> %load to <2 x float>
+  ret <2 x float> %fpext
+}
+
+define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_extload_v3bf16_to_v3f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_extload_v3bf16_to_v3f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_extload_v3bf16_to_v3f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[1:2], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: global_extload_v3bf16_to_v3f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_extload_v3bf16_to_v3f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_extload_v3bf16_to_v3f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %load = load <3 x bfloat>, ptr addrspace(1) %ptr
+  %fpext = fpext <3 x bfloat> %load to <3 x float>
+  ret <3 x float> %fpext
+}
+
+define <4 x float> @global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_extload_v4bf16_to_v4f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_extload_v4bf16_to_v4f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_extload_v4bf16_to_v4f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: global_extload_v4bf16_to_v4f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_extload_v4bf16_to_v4f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_extload_v4bf16_to_v4f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[2:3], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %load = load <4 x bfloat>, ptr addrspace(1) %ptr
+  %fpext = fpext <4 x bfloat> %load to <4 x float>
+  ret <4 x float> %fpext
+}
+
+define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_extload_v5bf16_to_v5f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:8
+; GCN-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_extload_v5bf16_to_v5f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:8
+; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_extload_v5bf16_to_v5f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: global_extload_v5bf16_to_v5f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    global_load_short_d16_hi v4, v[0:1], off offset:8
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_extload_v5bf16_to_v5f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    global_load_short_d16_hi v4, v[0:1], off offset:8
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_extload_v5bf16_to_v5f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[2:3], v[0:1], off
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    global_load_d16_hi_b16 v4, v[0:1], off offset:8
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %load = load <5 x bfloat>, ptr addrspace(1) %ptr
+  %fpext = fpext <5 x bfloat> %load to <5 x float>
+  ret <5 x float> %fpext
+}
+
+define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_extload_v6bf16_to_v6f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
+; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_extload_v6bf16_to_v6f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_load_dwordx3 v[3:5], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_extload_v6bf16_to_v6f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx3 v[3:5], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: global_extload_v6bf16_to_v6f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx3 v[6:8], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v7
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_extload_v6bf16_to_v6f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx3 v[4:6], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_extload_v6bf16_to_v6f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b96 v[4:6], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %load = load <6 x bfloat>, ptr addrspace(1) %ptr
+  %fpext = fpext <6 x bfloat> %load to <6 x float>
+  ret <6 x float> %fpext
+}
+
+define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_extload_v8bf16_to_v8f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_extload_v8bf16_to_v8f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_extload_v8bf16_to_v8f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: global_extload_v8bf16_to_v8f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v8
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v9
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v10
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_extload_v8bf16_to_v8f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v7
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v8
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v10
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_extload_v8bf16_to_v8f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[7:10], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v7
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v8
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v10
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %load = load <8 x bfloat>, ptr addrspace(1) %ptr
+  %fpext = fpext <8 x bfloat> %load to <8 x float>
+  ret <8 x float> %fpext
+}
+
+define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_extload_v16bf16_to_v16f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
+; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
+; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
+; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_extload_v16bf16_to_v16f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
+; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_extload_v16bf16_to_v16f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
+; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
+; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: global_extload_v16bf16_to_v16f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v16
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v17
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v18
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v19
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v20
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v21
+; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v22
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v23
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v16
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v17
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v18
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v19
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v20
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v21
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v22
+; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v23
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_extload_v16bf16_to_v16f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:16
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v16
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v17
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v18
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v19
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v20
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v21
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v22
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v16
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v23
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_extload_v16bf16_to_v16f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_load_b128 v[16:19], v[0:1], off
+; GFX11-NEXT:    global_load_b128 v[20:23], v[0:1], off offset:16
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v16
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v17
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v18
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v19
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v20
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v21
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v22
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v20
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v22
+; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v23
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %load = load <16 x bfloat>, ptr addrspace(1) %ptr
+  %fpext = fpext <16 x bfloat> %load to <16 x float>
+  ret <16 x float> %fpext
+}
+
+define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_extload_v32bf16_to_v32f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
+; GCN-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
+; GCN-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
+; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
+; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
+; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
+; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
+; GCN-NEXT:    v_lshlrev_b32_e32 v18, 16, v21
+; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v21
+; GCN-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
+; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v22
+; GCN-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v24, 16, v28
+; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v28
+; GCN-NEXT:    v_lshlrev_b32_e32 v26, 16, v29
+; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v29
+; GCN-NEXT:    v_lshlrev_b32_e32 v28, 16, v30
+; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v30
+; GCN-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
+; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_extload_v32bf16_to_v32f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
+; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
+; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
+; GFX7-NEXT:    v_lshlrev_b32_e32 v18, 16, v21
+; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v21
+; GFX7-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
+; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v22
+; GFX7-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v24, 16, v28
+; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v28
+; GFX7-NEXT:    v_lshlrev_b32_e32 v26, 16, v29
+; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v29
+; GFX7-NEXT:    v_lshlrev_b32_e32 v28, 16, v30
+; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v30
+; GFX7-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
+; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_extload_v32bf16_to_v32f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 16, v0
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-NEXT:    flat_load_dwordx4 v[12:15], v[2:3]
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 32, v0
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 48, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[20:23], v[2:3]
+; GFX8-NEXT:    flat_load_dwordx4 v[28:31], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
+; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
+; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v21
+; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
+; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v22
+; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v28
+; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v28
+; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v29
+; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v29
+; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v30
+; GFX8-NEXT:    v_and_b32_e32 v29, 0xffff0000, v30
+; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
+; GFX8-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: global_extload_v32bf16_to_v32f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:32
+; GFX9-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:48
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v16
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v17
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v18
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v19
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v20
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v21
+; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v22
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v23
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v16
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v17
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v18
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v19
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v20
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v21
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v22
+; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v23
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v24
+; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v25
+; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v26
+; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v27
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v25
+; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v26
+; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v27
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v25, 0xffff0000, v32
+; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff0000, v33
+; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v34
+; GFX9-NEXT:    v_and_b32_e32 v31, 0xffff0000, v35
+; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v32
+; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v33
+; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v34
+; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v35
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_extload_v32bf16_to_v32f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x3
+; GFX10-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx4 v[36:39], v[0:1], off offset:16
+; GFX10-NEXT:    global_load_dwordx4 v[48:51], v[0:1], off offset:32
+; GFX10-NEXT:    global_load_dwordx4 v[52:55], v[0:1], off offset:48
+; GFX10-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v32
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v33
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v34
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v35
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v36
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v37
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v38
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v39
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v48
+; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v49
+; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v50
+; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v51
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v52
+; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v53
+; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v54
+; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v55
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v32
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v33
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v35
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v36
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v37
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v38
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v39
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v48
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v49
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v50
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v51
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v52
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v53
+; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v54
+; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v55
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_extload_v32bf16_to_v32f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    global_load_b128 v[32:35], v[0:1], off
+; GFX11-NEXT:    global_load_b128 v[36:39], v[0:1], off offset:16
+; GFX11-NEXT:    global_load_b128 v[48:51], v[0:1], off offset:32
+; GFX11-NEXT:    global_load_b128 v[52:55], v[0:1], off offset:48
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v32
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v33
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v34
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v35
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v36
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v37
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v38
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v39
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v48
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v49
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v50
+; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v51
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v52
+; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v53
+; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v54
+; GFX11-NEXT:    v_and_b32_e32 v31, 0xffff0000, v55
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v32
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v33
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v34
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v35
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v36
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v37
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v38
+; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v39
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v48
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v49
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v50
+; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v51
+; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v52
+; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v53
+; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v54
+; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v55
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %load = load <32 x bfloat>, ptr addrspace(1) %ptr
+  %fpext = fpext <32 x bfloat> %load to <32 x float>
+  ret <32 x float> %fpext
+}
+
+define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_extload_v2bf16_to_v2f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_extload_v2bf16_to_v2f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_extload_v2bf16_to_v2f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dword v2, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: global_extload_v2bf16_to_v2f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v2, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_extload_v2bf16_to_v2f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_extload_v2bf16_to_v2f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %load = load <2 x bfloat>, ptr addrspace(1) %ptr
+  %fpext = fpext <2 x bfloat> %load to <2 x double>
+  ret <2 x double> %fpext
+}
+
+define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_extload_v3bf16_to_v3f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_extload_v3bf16_to_v3f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_extload_v3bf16_to_v3f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[1:2], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: global_extload_v3bf16_to_v3f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_extload_v3bf16_to_v3f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_extload_v3bf16_to_v3f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %load = load <3 x bfloat>, ptr addrspace(1) %ptr
+  %fpext = fpext <3 x bfloat> %load to <3 x double>
+  ret <3 x double> %fpext
+}
+
+define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_extload_v4bf16_to_v4f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_extload_v4bf16_to_v4f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_extload_v4bf16_to_v4f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: global_extload_v4bf16_to_v4f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v4
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_extload_v4bf16_to_v4f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_extload_v4bf16_to_v4f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %load = load <4 x bfloat>, ptr addrspace(1) %ptr
+  %fpext = fpext <4 x bfloat> %load to <4 x double>
+  ret <4 x double> %fpext
+}
+
+define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_extload_v5bf16_to_v5f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8
+; GCN-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[8:9], v2
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[2:3], v4
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_extload_v5bf16_to_v5f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[8:9], v2
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[2:3], v4
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_extload_v5bf16_to_v5f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ushort v8, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: global_extload_v5bf16_to_v5f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:8
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v4
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_extload_v5bf16_to_v5f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    global_load_short_d16_hi v4, v[0:1], off offset:8
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[8:9], v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_extload_v5bf16_to_v5f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_load_b64 v[2:3], v[0:1], off
+; GFX11-NEXT:    global_load_d16_hi_b16 v4, v[0:1], off offset:8
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[8:9], v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %load = load <5 x bfloat>, ptr addrspace(1) %ptr
+  %fpext = fpext <5 x bfloat> %load to <5 x double>
+  ret <5 x double> %fpext
+}
+
+define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_extload_v6bf16_to_v6f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[2:3], v4
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_extload_v6bf16_to_v6f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[2:3], v4
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_extload_v6bf16_to_v6f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx3 v[0:2], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[2:3], v4
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: global_extload_v6bf16_to_v6f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v4
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[10:11], v5
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v8
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v9
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_extload_v6bf16_to_v6f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx3 v[4:6], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v6
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_extload_v6bf16_to_v6f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b96 v[4:6], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v6
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %load = load <6 x bfloat>, ptr addrspace(1) %ptr
+  %fpext = fpext <6 x bfloat> %load to <6 x double>
+  ret <6 x double> %fpext
+}
+
+define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_extload_v8bf16_to_v8f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v3
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[2:3], v5
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_extload_v8bf16_to_v8f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v3
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[2:3], v5
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_extload_v8bf16_to_v8f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v3
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[2:3], v5
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: global_extload_v8bf16_to_v8f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v3
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v4
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v5
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[10:11], v8
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[14:15], v9
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v12
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v13
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[12:13], v16
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_extload_v8bf16_to_v8f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v10
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[10:11], v11
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_extload_v8bf16_to_v8f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[7:10], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v10
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[10:11], v11
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %load = load <8 x bfloat>, ptr addrspace(1) %ptr
+  %fpext = fpext <8 x bfloat> %load to <8 x double>
+  ret <8 x double> %fpext
+}
+
+define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_extload_v16bf16_to_v16f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
+; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
+; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v5
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v6
+; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
+; GCN-NEXT:    v_lshlrev_b32_e32 v20, 16, v7
+; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v7
+; GCN-NEXT:    v_lshlrev_b32_e32 v24, 16, v8
+; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
+; GCN-NEXT:    v_lshlrev_b32_e32 v28, 16, v9
+; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v9
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v10
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[6:7], v11
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[8:9], v12
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[10:11], v13
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[12:13], v14
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[14:15], v15
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[18:19], v18
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[20:21], v20
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[22:23], v22
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[24:25], v24
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[26:27], v26
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[28:29], v28
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[30:31], v30
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_extload_v16bf16_to_v16f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 16, v6
+; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v20, 16, v7
+; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v24, 16, v8
+; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v28, 16, v9
+; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v9
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[4:5], v10
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[6:7], v11
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[8:9], v12
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[10:11], v13
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[12:13], v14
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[14:15], v15
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[18:19], v18
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v20
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[22:23], v22
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[24:25], v24
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[26:27], v26
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[28:29], v28
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[30:31], v30
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_extload_v16bf16_to_v16f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[2:5], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[6:9], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
+; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v5
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v6
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v7
+; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v8
+; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v9
+; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v9
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[4:5], v10
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[6:7], v11
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[8:9], v12
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[10:11], v13
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[12:13], v14
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[14:15], v15
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[18:19], v18
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[20:21], v20
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[22:23], v22
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[24:25], v24
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[26:27], v26
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[28:29], v28
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[30:31], v30
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: global_extload_v16bf16_to_v16f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
+; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v6
+; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v7
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v8
+; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v34, 16, v9
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v0
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v1
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[14:15], v12
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[18:19], v13
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[22:23], v16
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[26:27], v17
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[30:31], v20
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v21
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v24
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v25
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[12:13], v28
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[16:17], v29
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[20:21], v32
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[24:25], v33
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[28:29], v34
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_extload_v16bf16_to_v16f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx4 v[9:12], v[0:1], off offset:16
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v10
+; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v11
+; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v12
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v6
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[4:5], v8
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[8:9], v15
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[10:11], v13
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[12:13], v18
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[18:19], v16
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[22:23], v17
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[16:17], v24
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[26:27], v20
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[30:31], v21
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[20:21], v25
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[24:25], v28
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[28:29], v29
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_extload_v16bf16_to_v16f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_load_b128 v[7:10], v[0:1], off
+; GFX11-NEXT:    global_load_b128 v[23:26], v[0:1], off offset:16
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v10
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v10
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v23
+; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v24
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
+; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v26
+; GFX11-NEXT:    v_and_b32_e32 v30, 0xffff0000, v26
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[10:11], v11
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[18:19], v18
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[20:21], v20
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[22:23], v22
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[24:25], v24
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[26:27], v27
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[28:29], v28
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[30:31], v30
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %load = load <16 x bfloat>, ptr addrspace(1) %ptr
+  %fpext = fpext <16 x bfloat> %load to <16 x double>
+  ret <16 x double> %fpext
+}
+
+define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_extload_v32bf16_to_v32f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s6
+; GCN-NEXT:    buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:2
+; GCN-NEXT:    buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:4
+; GCN-NEXT:    buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:6
+; GCN-NEXT:    buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:8
+; GCN-NEXT:    buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:10
+; GCN-NEXT:    buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:12
+; GCN-NEXT:    buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:14
+; GCN-NEXT:    buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT:    buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:18
+; GCN-NEXT:    buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:20
+; GCN-NEXT:    buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:22
+; GCN-NEXT:    buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:24
+; GCN-NEXT:    buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:26
+; GCN-NEXT:    buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:28
+; GCN-NEXT:    buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:30
+; GCN-NEXT:    buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT:    buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:50
+; GCN-NEXT:    buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:52
+; GCN-NEXT:    buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:54
+; GCN-NEXT:    buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:56
+; GCN-NEXT:    buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:58
+; GCN-NEXT:    buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:60
+; GCN-NEXT:    buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:62
+; GCN-NEXT:    buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT:    buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:34
+; GCN-NEXT:    buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:36
+; GCN-NEXT:    buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:38
+; GCN-NEXT:    buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:40
+; GCN-NEXT:    buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:42
+; GCN-NEXT:    buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44
+; GCN-NEXT:    buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46
+; GCN-NEXT:    s_waitcnt vmcnt(8)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v32
+; GCN-NEXT:    v_add_i32_e32 v32, vcc, 0xfc, v0
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT:    buffer_store_dword v2, v32, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0xf8, v0
+; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v31
+; GCN-NEXT:    v_add_i32_e32 v31, vcc, 0xf4, v0
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0xf0, v0
+; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v31, vcc, 0xec, v0
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v30
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0xe8, v0
+; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v30, vcc, 0xe4, v0
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v29
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT:    buffer_store_dword v2, v30, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 0xe0, v0
+; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v29, vcc, 0xdc, v0
+; GCN-NEXT:    v_add_i32_e32 v30, vcc, 0xd8, v0
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v28
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT:    buffer_store_dword v2, v29, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v28, vcc, 0xd4, v0
+; GCN-NEXT:    buffer_store_dword v1, v30, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v29, vcc, 0xd0, v0
+; GCN-NEXT:    v_add_i32_e32 v30, vcc, 0xcc, v0
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v27
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT:    buffer_store_dword v2, v28, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v27, vcc, 0xc8, v0
+; GCN-NEXT:    buffer_store_dword v1, v29, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v28, vcc, 0xc4, v0
+; GCN-NEXT:    v_add_i32_e32 v29, vcc, 0xc0, v0
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v26
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT:    buffer_store_dword v2, v30, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v26, vcc, 0xbc, v0
+; GCN-NEXT:    buffer_store_dword v1, v27, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v27, vcc, 0xb8, v0
+; GCN-NEXT:    v_add_i32_e32 v30, vcc, 0xb4, v0
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v25
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT:    buffer_store_dword v2, v28, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v25, vcc, 0xb0, v0
+; GCN-NEXT:    buffer_store_dword v1, v29, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v28, vcc, 0xac, v0
+; GCN-NEXT:    v_add_i32_e32 v29, vcc, 0xa8, v0
+; GCN-NEXT:    s_waitcnt vmcnt(14) expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v34
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT:    buffer_store_dword v2, v26, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v26, vcc, 0xa4, v0
+; GCN-NEXT:    buffer_store_dword v1, v27, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v27, vcc, 0xa0, v0
+; GCN-NEXT:    v_add_i32_e32 v31, vcc, 0x9c, v0
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v33
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT:    buffer_store_dword v2, v30, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v30, vcc, 0x98, v0
+; GCN-NEXT:    buffer_store_dword v1, v25, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v25, vcc, 0x94, v0
+; GCN-NEXT:    v_add_i32_e32 v32, vcc, 0x90, v0
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v24
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT:    buffer_store_dword v2, v28, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v24, vcc, 0x8c, v0
+; GCN-NEXT:    buffer_store_dword v1, v29, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v28, vcc, 0x88, v0
+; GCN-NEXT:    v_add_i32_e32 v29, vcc, 0x84, v0
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v23
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT:    buffer_store_dword v2, v26, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v23, vcc, 0x80, v0
+; GCN-NEXT:    buffer_store_dword v1, v27, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v26, vcc, 0x7c, v0
+; GCN-NEXT:    v_add_i32_e32 v27, vcc, 0x78, v0
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v22
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v22, vcc, 0x74, v0
+; GCN-NEXT:    buffer_store_dword v1, v30, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v30, vcc, 0x70, v0
+; GCN-NEXT:    v_add_i32_e32 v31, vcc, 0x6c, v0
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v21
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT:    buffer_store_dword v2, v25, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v21, vcc, 0x68, v0
+; GCN-NEXT:    buffer_store_dword v1, v32, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v25, vcc, 0x64, v0
+; GCN-NEXT:    v_add_i32_e32 v32, vcc, 0x60, v0
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v20
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT:    buffer_store_dword v2, v24, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v20, vcc, 0x5c, v0
+; GCN-NEXT:    buffer_store_dword v1, v28, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v24, vcc, 0x58, v0
+; GCN-NEXT:    v_add_i32_e32 v28, vcc, 0x54, v0
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v19
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT:    buffer_store_dword v2, v29, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v19, vcc, 0x50, v0
+; GCN-NEXT:    buffer_store_dword v1, v23, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v23, vcc, 0x4c, v0
+; GCN-NEXT:    v_add_i32_e32 v29, vcc, 0x48, v0
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v18
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT:    buffer_store_dword v2, v26, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v18, vcc, 0x44, v0
+; GCN-NEXT:    buffer_store_dword v1, v27, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v26, vcc, 64, v0
+; GCN-NEXT:    v_add_i32_e32 v27, vcc, 60, v0
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v17
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT:    buffer_store_dword v2, v22, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v17, vcc, 56, v0
+; GCN-NEXT:    buffer_store_dword v1, v30, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v22, vcc, 52, v0
+; GCN-NEXT:    v_add_i32_e32 v30, vcc, 48, v0
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v16
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v31, vcc, 44, v0
+; GCN-NEXT:    buffer_store_dword v1, v21, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v21, vcc, 40, v0
+; GCN-NEXT:    v_add_i32_e32 v33, vcc, 36, v0
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v15
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT:    buffer_store_dword v2, v25, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v25, vcc, 32, v0
+; GCN-NEXT:    buffer_store_dword v1, v32, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v32, vcc, 28, v0
+; GCN-NEXT:    v_add_i32_e32 v34, vcc, 24, v0
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v14
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT:    buffer_store_dword v2, v20, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v20, vcc, 20, v0
+; GCN-NEXT:    buffer_store_dword v1, v24, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v24, vcc, 16, v0
+; GCN-NEXT:    v_add_i32_e32 v35, vcc, 12, v0
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v13
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT:    buffer_store_dword v2, v28, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v28, vcc, 8, v0
+; GCN-NEXT:    buffer_store_dword v1, v19, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v19, vcc, 4, v0
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v12
+; GCN-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GCN-NEXT:    v_lshlrev_b32_e32 v13, 16, v4
+; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GCN-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
+; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v7
+; GCN-NEXT:    v_lshlrev_b32_e32 v36, 16, v8
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[3:4], v11
+; GCN-NEXT:    buffer_store_dword v2, v23, s[0:3], 0 offen
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[5:6], v10
+; GCN-NEXT:    buffer_store_dword v1, v29, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[1:2], v9
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[7:8], v12
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[9:10], v13
+; GCN-NEXT:    buffer_store_dword v4, v18, s[0:3], 0 offen
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[11:12], v36
+; GCN-NEXT:    buffer_store_dword v3, v26, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[3:4], v14
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[13:14], v15
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[15:16], v16
+; GCN-NEXT:    buffer_store_dword v6, v27, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v5, v17, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v2, v22, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v1, v30, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v12, v31, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v11, v21, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v16, v33, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v15, v25, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v14, v32, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v13, v34, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v4, v20, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v3, v24, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v10, v35, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v9, v28, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v8, v19, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_extload_v32bf16_to_v32f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:62
+; GFX7-NEXT:    buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:60
+; GFX7-NEXT:    buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:58
+; GFX7-NEXT:    buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:56
+; GFX7-NEXT:    buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:54
+; GFX7-NEXT:    buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:52
+; GFX7-NEXT:    buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:50
+; GFX7-NEXT:    buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT:    buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT:    buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:34
+; GFX7-NEXT:    buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:36
+; GFX7-NEXT:    buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:38
+; GFX7-NEXT:    buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:40
+; GFX7-NEXT:    buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:42
+; GFX7-NEXT:    buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44
+; GFX7-NEXT:    buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46
+; GFX7-NEXT:    buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:2
+; GFX7-NEXT:    buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:4
+; GFX7-NEXT:    buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:6
+; GFX7-NEXT:    buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:8
+; GFX7-NEXT:    buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:10
+; GFX7-NEXT:    buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:12
+; GFX7-NEXT:    buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:14
+; GFX7-NEXT:    buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT:    buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 offset:18
+; GFX7-NEXT:    buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:20
+; GFX7-NEXT:    buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:22
+; GFX7-NEXT:    buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:24
+; GFX7-NEXT:    buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:26
+; GFX7-NEXT:    buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:28
+; GFX7-NEXT:    buffer_load_ushort v1, v[1:2], s[4:7], 0 addr64 offset:30
+; GFX7-NEXT:    s_waitcnt vmcnt(14)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v20
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v2
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xfc, v0
+; GFX7-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xf8, v0
+; GFX7-NEXT:    buffer_store_dword v20, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v22
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v2
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xf4, v0
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, 0xd8, v0
+; GFX7-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xf0, v0
+; GFX7-NEXT:    buffer_store_dword v20, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v23
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v2
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xec, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(14)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX7-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xe8, v0
+; GFX7-NEXT:    buffer_store_dword v20, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v24
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v2
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xe4, v0
+; GFX7-NEXT:    v_add_i32_e32 v24, vcc, 0xd0, v0
+; GFX7-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xe0, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v21, 16, v25
+; GFX7-NEXT:    buffer_store_dword v20, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v21
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xdc, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(8)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_lshlrev_b32_e32 v21, 16, v26
+; GFX7-NEXT:    buffer_store_dword v20, v22, s[0:3], 0 offen
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v21
+; GFX7-NEXT:    v_lshlrev_b32_e32 v22, 16, v27
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xd4, v0
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[22:23], v22
+; GFX7-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v20, v24, s[0:3], 0 offen
+; GFX7-NEXT:    v_lshlrev_b32_e32 v20, 16, v28
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v20
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xcc, v0
+; GFX7-NEXT:    buffer_store_dword v23, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xc8, v0
+; GFX7-NEXT:    buffer_store_dword v22, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xc4, v0
+; GFX7-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_lshlrev_b32_e32 v21, 16, v34
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[21:22], v21
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xc0, v0
+; GFX7-NEXT:    buffer_store_dword v20, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xbc, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v20, 16, v33
+; GFX7-NEXT:    buffer_store_dword v22, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[22:23], v20
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xb8, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v20, 16, v32
+; GFX7-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v20
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xb4, v0
+; GFX7-NEXT:    buffer_store_dword v23, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xb0, v0
+; GFX7-NEXT:    buffer_store_dword v22, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xac, v0
+; GFX7-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_lshlrev_b32_e32 v21, 16, v31
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[21:22], v21
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xa8, v0
+; GFX7-NEXT:    buffer_store_dword v20, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xa4, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v20, 16, v30
+; GFX7-NEXT:    buffer_store_dword v22, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[22:23], v20
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xa0, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v20, 16, v29
+; GFX7-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v20
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x9c, v0
+; GFX7-NEXT:    buffer_store_dword v23, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x98, v0
+; GFX7-NEXT:    buffer_store_dword v22, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x94, v0
+; GFX7-NEXT:    buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x90, v0
+; GFX7-NEXT:    buffer_store_dword v20, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v18
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v2
+; GFX7-NEXT:    v_add_i32_e32 v18, vcc, 0x8c, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v19
+; GFX7-NEXT:    buffer_store_dword v21, v18, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v18, vcc, 0x88, v0
+; GFX7-NEXT:    buffer_store_dword v20, v18, s[0:3], 0 offen
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[18:19], v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v15
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v2
+; GFX7-NEXT:    v_add_i32_e32 v15, vcc, 0x84, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v17
+; GFX7-NEXT:    buffer_store_dword v21, v15, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v15, vcc, 0x80, v0
+; GFX7-NEXT:    buffer_store_dword v20, v15, s[0:3], 0 offen
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[20:21], v2
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GFX7-NEXT:    v_add_i32_e32 v15, vcc, 0x7c, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX7-NEXT:    buffer_store_dword v2, v15, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[1:2], v14
+; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v16
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
+; GFX7-NEXT:    v_add_i32_e32 v16, vcc, 0x74, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX7-NEXT:    buffer_store_dword v15, v16, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v15, vcc, 0x70, v0
+; GFX7-NEXT:    buffer_store_dword v14, v15, s[0:3], 0 offen
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[14:15], v12
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
+; GFX7-NEXT:    v_add_i32_e32 v16, vcc, 0x6c, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX7-NEXT:    buffer_store_dword v13, v16, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v13, vcc, 0x68, v0
+; GFX7-NEXT:    buffer_store_dword v12, v13, s[0:3], 0 offen
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[12:13], v10
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
+; GFX7-NEXT:    v_add_i32_e32 v16, vcc, 0x64, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    buffer_store_dword v11, v16, s[0:3], 0 offen
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[16:17], v8
+; GFX7-NEXT:    v_add_i32_e32 v11, vcc, 0x60, v0
+; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 0x5c, v0
+; GFX7-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v17, v8, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 0x58, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT:    buffer_store_dword v16, v8, s[0:3], 0 offen
+; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 16, v5
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX7-NEXT:    v_add_i32_e32 v6, vcc, 0x54, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 0x50, v0
+; GFX7-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[3:4], v3
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[5:6], v16
+; GFX7-NEXT:    v_add_i32_e32 v16, vcc, 0x4c, v0
+; GFX7-NEXT:    buffer_store_dword v4, v16, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x48, v0
+; GFX7-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[3:4], v11
+; GFX7-NEXT:    v_add_i32_e32 v11, vcc, 0x44, v0
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[7:8], v7
+; GFX7-NEXT:    buffer_store_dword v6, v11, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v6, vcc, 64, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 60, v0
+; GFX7-NEXT:    v_cvt_f64_f32_e32 v[9:10], v9
+; GFX7-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 56, v0
+; GFX7-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
+; GFX7-NEXT:    buffer_store_dword v8, v3, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
+; GFX7-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 44, v0
+; GFX7-NEXT:    buffer_store_dword v10, v3, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 40, v0
+; GFX7-NEXT:    buffer_store_dword v9, v3, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 36, v0
+; GFX7-NEXT:    buffer_store_dword v13, v3, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 32, v0
+; GFX7-NEXT:    buffer_store_dword v12, v3, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
+; GFX7-NEXT:    buffer_store_dword v15, v3, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 24, v0
+; GFX7-NEXT:    buffer_store_dword v14, v3, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
+; GFX7-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 16, v0
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; GFX7-NEXT:    buffer_store_dword v21, v1, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; GFX7-NEXT:    buffer_store_dword v20, v1, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; GFX7-NEXT:    buffer_store_dword v19, v1, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_extload_v32bf16_to_v32f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 2, v1
+; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 4, v1
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 6, v1
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 8, v1
+; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v11, vcc, 10, v1
+; GFX8-NEXT:    v_addc_u32_e32 v12, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v13, vcc, 12, v1
+; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, 14, v1
+; GFX8-NEXT:    v_addc_u32_e32 v18, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v21, vcc, 16, v1
+; GFX8-NEXT:    v_addc_u32_e32 v22, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v15, vcc, 18, v1
+; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v19, vcc, 20, v1
+; GFX8-NEXT:    v_addc_u32_e32 v20, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v23, vcc, 22, v1
+; GFX8-NEXT:    v_addc_u32_e32 v24, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v25, vcc, 24, v1
+; GFX8-NEXT:    v_addc_u32_e32 v26, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v27, vcc, 26, v1
+; GFX8-NEXT:    v_addc_u32_e32 v28, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v29, vcc, 28, v1
+; GFX8-NEXT:    v_addc_u32_e32 v30, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v31, vcc, 30, v1
+; GFX8-NEXT:    v_addc_u32_e32 v32, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v33, vcc, 32, v1
+; GFX8-NEXT:    v_addc_u32_e32 v34, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v35, vcc, 34, v1
+; GFX8-NEXT:    v_addc_u32_e32 v36, vcc, 0, v2, vcc
+; GFX8-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX8-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX8-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX8-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX8-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX8-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8-NEXT:    buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8-NEXT:    v_add_u32_e32 v37, vcc, 36, v1
+; GFX8-NEXT:    flat_load_ushort v43, v[1:2]
+; GFX8-NEXT:    v_addc_u32_e32 v38, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v48, vcc, 38, v1
+; GFX8-NEXT:    v_addc_u32_e32 v49, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v50, vcc, 62, v1
+; GFX8-NEXT:    v_addc_u32_e32 v51, vcc, 0, v2, vcc
+; GFX8-NEXT:    flat_load_ushort v44, v[50:51]
+; GFX8-NEXT:    v_add_u32_e32 v50, vcc, 60, v1
+; GFX8-NEXT:    v_addc_u32_e32 v51, vcc, 0, v2, vcc
+; GFX8-NEXT:    flat_load_ushort v45, v[50:51]
+; GFX8-NEXT:    v_add_u32_e32 v50, vcc, 40, v1
+; GFX8-NEXT:    v_addc_u32_e32 v51, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v52, vcc, 58, v1
+; GFX8-NEXT:    v_addc_u32_e32 v53, vcc, 0, v2, vcc
+; GFX8-NEXT:    flat_load_ushort v46, v[52:53]
+; GFX8-NEXT:    v_add_u32_e32 v52, vcc, 42, v1
+; GFX8-NEXT:    v_addc_u32_e32 v53, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v54, vcc, 56, v1
+; GFX8-NEXT:    v_addc_u32_e32 v55, vcc, 0, v2, vcc
+; GFX8-NEXT:    flat_load_ushort v47, v[54:55]
+; GFX8-NEXT:    v_add_u32_e32 v54, vcc, 44, v1
+; GFX8-NEXT:    v_addc_u32_e32 v55, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v39, vcc, 54, v1
+; GFX8-NEXT:    v_addc_u32_e32 v40, vcc, 0, v2, vcc
+; GFX8-NEXT:    flat_load_ushort v56, v[39:40]
+; GFX8-NEXT:    v_add_u32_e32 v39, vcc, 52, v1
+; GFX8-NEXT:    v_addc_u32_e32 v40, vcc, 0, v2, vcc
+; GFX8-NEXT:    flat_load_ushort v57, v[39:40]
+; GFX8-NEXT:    v_add_u32_e32 v39, vcc, 46, v1
+; GFX8-NEXT:    v_addc_u32_e32 v40, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v41, vcc, 50, v1
+; GFX8-NEXT:    v_addc_u32_e32 v42, vcc, 0, v2, vcc
+; GFX8-NEXT:    flat_load_ushort v41, v[41:42]
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 48, v1
+; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; GFX8-NEXT:    flat_load_ushort v42, v[9:10]
+; GFX8-NEXT:    flat_load_ushort v9, v[35:36]
+; GFX8-NEXT:    flat_load_ushort v10, v[37:38]
+; GFX8-NEXT:    flat_load_ushort v35, v[48:49]
+; GFX8-NEXT:    flat_load_ushort v36, v[50:51]
+; GFX8-NEXT:    flat_load_ushort v37, v[52:53]
+; GFX8-NEXT:    flat_load_ushort v48, v[54:55]
+; GFX8-NEXT:    flat_load_ushort v39, v[39:40]
+; GFX8-NEXT:    flat_load_ushort v49, v[1:2]
+; GFX8-NEXT:    flat_load_ushort v50, v[3:4]
+; GFX8-NEXT:    flat_load_ushort v51, v[5:6]
+; GFX8-NEXT:    flat_load_ushort v52, v[7:8]
+; GFX8-NEXT:    flat_load_ushort v53, v[11:12]
+; GFX8-NEXT:    flat_load_ushort v38, v[13:14]
+; GFX8-NEXT:    flat_load_ushort v14, v[17:18]
+; GFX8-NEXT:    flat_load_ushort v11, v[21:22]
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 4, v0
+; GFX8-NEXT:    flat_load_ushort v15, v[15:16]
+; GFX8-NEXT:    flat_load_ushort v13, v[19:20]
+; GFX8-NEXT:    flat_load_ushort v8, v[23:24]
+; GFX8-NEXT:    flat_load_ushort v6, v[25:26]
+; GFX8-NEXT:    flat_load_ushort v5, v[27:28]
+; GFX8-NEXT:    flat_load_ushort v7, v[29:30]
+; GFX8-NEXT:    flat_load_ushort v12, v[31:32]
+; GFX8-NEXT:    flat_load_ushort v16, v[33:34]
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, 0xc4, v0
+; GFX8-NEXT:    v_add_u32_e32 v20, vcc, 0xbc, v0
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, 0xb4, v0
+; GFX8-NEXT:    v_add_u32_e32 v24, vcc, 0xac, v0
+; GFX8-NEXT:    v_add_u32_e32 v26, vcc, 0xa4, v0
+; GFX8-NEXT:    v_add_u32_e32 v27, vcc, 0x9c, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(14)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v43
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GFX8-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0xfc, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v44
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX8-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v45
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[3:4], v3
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0xf8, v0
+; GFX8-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0xf4, v0
+; GFX8-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v46
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xf0, v0
+; GFX8-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xec, v0
+; GFX8-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xe8, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v47
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX8-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0xe4, v0
+; GFX8-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0xe0, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v56
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[3:4], v3
+; GFX8-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0xdc, v0
+; GFX8-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v57
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xd8, v0
+; GFX8-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xd4, v0
+; GFX8-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0xd0, v0
+; GFX8-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v41
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xcc, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v42
+; GFX8-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0xc8, v0
+; GFX8-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[1:2], v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v49
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[3:4], v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v50
+; GFX8-NEXT:    s_waitcnt vmcnt(14)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v51
+; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v52
+; GFX8-NEXT:    buffer_store_dword v4, v18, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xc0, v0
+; GFX8-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[3:4], v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v39
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[17:18], v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v53
+; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v38
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    buffer_store_dword v18, v20, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, 0xb8, v0
+; GFX8-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[17:18], v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v48
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[19:20], v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    buffer_store_dword v20, v22, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v20, vcc, 0xb0, v0
+; GFX8-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[19:20], v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v37
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[21:22], v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    buffer_store_dword v22, v24, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, 0xa8, v0
+; GFX8-NEXT:    buffer_store_dword v21, v22, s[0:3], 0 offen
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[21:22], v23
+; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v36
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[23:24], v23
+; GFX8-NEXT:    buffer_store_dword v24, v26, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v24, vcc, 0xa0, v0
+; GFX8-NEXT:    buffer_store_dword v23, v24, s[0:3], 0 offen
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[23:24], v25
+; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v35
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[25:26], v25
+; GFX8-NEXT:    buffer_store_dword v26, v27, s[0:3], 0 offen
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[27:28], v10
+; GFX8-NEXT:    v_add_u32_e32 v26, vcc, 0x98, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
+; GFX8-NEXT:    v_add_u32_e32 v11, vcc, 0x94, v0
+; GFX8-NEXT:    buffer_store_dword v25, v26, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v28, v11, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v11, vcc, 0x90, v0
+; GFX8-NEXT:    buffer_store_dword v27, v11, s[0:3], 0 offen
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[27:28], v9
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[25:26], v14
+; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 0x8c, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v15
+; GFX8-NEXT:    buffer_store_dword v28, v14, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 0x88, v0
+; GFX8-NEXT:    buffer_store_dword v27, v14, s[0:3], 0 offen
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[14:15], v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v16
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[27:28], v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v13
+; GFX8-NEXT:    v_add_u32_e32 v13, vcc, 0x84, v0
+; GFX8-NEXT:    buffer_store_dword v28, v13, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v13, vcc, 0x80, v0
+; GFX8-NEXT:    buffer_store_dword v27, v13, s[0:3], 0 offen
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[27:28], v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[12:13], v9
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7c, v0
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
+; GFX8-NEXT:    buffer_store_dword v13, v9, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x78, v0
+; GFX8-NEXT:    buffer_store_dword v12, v9, s[0:3], 0 offen
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; GFX8-NEXT:    v_add_u32_e32 v13, vcc, 0x74, v0
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
+; GFX8-NEXT:    buffer_store_dword v7, v13, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x70, v0
+; GFX8-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[5:6], v5
+; GFX8-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x6c, v0
+; GFX8-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x68, v0
+; GFX8-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x64, v0
+; GFX8-NEXT:    buffer_store_dword v13, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x60, v0
+; GFX8-NEXT:    buffer_store_dword v12, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x5c, v0
+; GFX8-NEXT:    buffer_store_dword v9, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x58, v0
+; GFX8-NEXT:    buffer_store_dword v8, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x54, v0
+; GFX8-NEXT:    buffer_store_dword v28, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x50, v0
+; GFX8-NEXT:    buffer_store_dword v27, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x4c, v0
+; GFX8-NEXT:    buffer_store_dword v15, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x48, v0
+; GFX8-NEXT:    buffer_store_dword v14, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x44, v0
+; GFX8-NEXT:    buffer_store_dword v11, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 64, v0
+; GFX8-NEXT:    buffer_store_dword v10, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 60, v0
+; GFX8-NEXT:    buffer_store_dword v26, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 56, v0
+; GFX8-NEXT:    buffer_store_dword v25, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 52, v0
+; GFX8-NEXT:    buffer_store_dword v24, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 48, v0
+; GFX8-NEXT:    buffer_store_dword v23, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 44, v0
+; GFX8-NEXT:    buffer_store_dword v22, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 40, v0
+; GFX8-NEXT:    buffer_store_dword v21, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 36, v0
+; GFX8-NEXT:    buffer_store_dword v20, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 32, v0
+; GFX8-NEXT:    buffer_store_dword v19, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 28, v0
+; GFX8-NEXT:    buffer_store_dword v18, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 24, v0
+; GFX8-NEXT:    buffer_store_dword v17, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 20, v0
+; GFX8-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
+; GFX8-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 12, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v0
+; GFX8-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: global_extload_v32bf16_to_v32f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v21, v[1:2], off offset:62
+; GFX9-NEXT:    global_load_ushort v23, v[1:2], off offset:60
+; GFX9-NEXT:    global_load_ushort v24, v[1:2], off offset:58
+; GFX9-NEXT:    global_load_ushort v25, v[1:2], off offset:56
+; GFX9-NEXT:    global_load_ushort v26, v[1:2], off offset:54
+; GFX9-NEXT:    global_load_ushort v27, v[1:2], off offset:52
+; GFX9-NEXT:    global_load_ushort v28, v[1:2], off offset:50
+; GFX9-NEXT:    global_load_ushort v29, v[1:2], off offset:48
+; GFX9-NEXT:    global_load_ushort v30, v[1:2], off offset:46
+; GFX9-NEXT:    global_load_ushort v31, v[1:2], off offset:44
+; GFX9-NEXT:    global_load_ushort v32, v[1:2], off offset:42
+; GFX9-NEXT:    global_load_ushort v33, v[1:2], off offset:40
+; GFX9-NEXT:    global_load_ushort v34, v[1:2], off offset:38
+; GFX9-NEXT:    global_load_ushort v19, v[1:2], off
+; GFX9-NEXT:    global_load_ushort v20, v[1:2], off offset:36
+; GFX9-NEXT:    global_load_ushort v17, v[1:2], off offset:2
+; GFX9-NEXT:    global_load_ushort v18, v[1:2], off offset:4
+; GFX9-NEXT:    global_load_ushort v16, v[1:2], off offset:34
+; GFX9-NEXT:    global_load_ushort v11, v[1:2], off offset:32
+; GFX9-NEXT:    global_load_ushort v13, v[1:2], off offset:6
+; GFX9-NEXT:    global_load_ushort v14, v[1:2], off offset:8
+; GFX9-NEXT:    global_load_ushort v15, v[1:2], off offset:30
+; GFX9-NEXT:    global_load_ushort v3, v[1:2], off offset:16
+; GFX9-NEXT:    global_load_ushort v4, v[1:2], off offset:18
+; GFX9-NEXT:    global_load_ushort v5, v[1:2], off offset:20
+; GFX9-NEXT:    global_load_ushort v6, v[1:2], off offset:22
+; GFX9-NEXT:    global_load_ushort v8, v[1:2], off offset:24
+; GFX9-NEXT:    global_load_ushort v10, v[1:2], off offset:26
+; GFX9-NEXT:    global_load_ushort v12, v[1:2], off offset:28
+; GFX9-NEXT:    global_load_ushort v9, v[1:2], off offset:10
+; GFX9-NEXT:    global_load_ushort v7, v[1:2], off offset:12
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    global_load_ushort v1, v[1:2], off offset:14
+; GFX9-NEXT:    s_waitcnt vmcnt(31)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v21
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[21:22], v2
+; GFX9-NEXT:    s_waitcnt vmcnt(30)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v23
+; GFX9-NEXT:    s_waitcnt vmcnt(28)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v25
+; GFX9-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:252
+; GFX9-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:248
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[21:22], v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v24
+; GFX9-NEXT:    s_waitcnt vmcnt(29)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v26
+; GFX9-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:244
+; GFX9-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:240
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[21:22], v2
+; GFX9-NEXT:    s_waitcnt vmcnt(30)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
+; GFX9-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:236
+; GFX9-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:232
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[21:22], v23
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[23:24], v24
+; GFX9-NEXT:    s_waitcnt vmcnt(31)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v28
+; GFX9-NEXT:    s_waitcnt vmcnt(30)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v29
+; GFX9-NEXT:    s_waitcnt vmcnt(29)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v30
+; GFX9-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:228
+; GFX9-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:224
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[21:22], v25
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[25:26], v26
+; GFX9-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:220
+; GFX9-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:216
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[23:24], v27
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[27:28], v2
+; GFX9-NEXT:    s_waitcnt vmcnt(28)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v19
+; GFX9-NEXT:    s_waitcnt vmcnt(27)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[19:20], v19
+; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v31
+; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v32
+; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v33
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v34
+; GFX9-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:212
+; GFX9-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:208
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[21:22], v29
+; GFX9-NEXT:    s_waitcnt vmcnt(26)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[29:30], v30
+; GFX9-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:204
+; GFX9-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:200
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[25:26], v31
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[31:32], v32
+; GFX9-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:196
+; GFX9-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:192
+; GFX9-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:188
+; GFX9-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:184
+; GFX9-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:180
+; GFX9-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:176
+; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:172
+; GFX9-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:168
+; GFX9-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:164
+; GFX9-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:160
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:156
+; GFX9-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:152
+; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v17
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
+; GFX9-NEXT:    s_waitcnt vmcnt(39)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:148
+; GFX9-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:144
+; GFX9-NEXT:    s_waitcnt vmcnt(40)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v13
+; GFX9-NEXT:    s_waitcnt vmcnt(39)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v14
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[13:14], v11
+; GFX9-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:140
+; GFX9-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:136
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[16:17], v2
+; GFX9-NEXT:    s_waitcnt vmcnt(40)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v15
+; GFX9-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:132
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[14:15], v2
+; GFX9-NEXT:    s_waitcnt vmcnt(34)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v12
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[11:12], v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v10
+; GFX9-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:128
+; GFX9-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:124
+; GFX9-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:120
+; GFX9-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:116
+; GFX9-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:112
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[10:11], v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT:    s_waitcnt vmcnt(38)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v9
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
+; GFX9-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:108
+; GFX9-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:104
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[10:11], v2
+; GFX9-NEXT:    s_waitcnt vmcnt(39)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    s_waitcnt vmcnt(38)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v5
+; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:100
+; GFX9-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:96
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v2
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[3:4], v3
+; GFX9-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:92
+; GFX9-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:88
+; GFX9-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:84
+; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:80
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[1:2], v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v18
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[18:19], v21
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[13:14], v22
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[20:21], v20
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[22:23], v23
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[5:6], v12
+; GFX9-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:76
+; GFX9-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:72
+; GFX9-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:68
+; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:64
+; GFX9-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:56
+; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:52
+; GFX9-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:48
+; GFX9-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:44
+; GFX9-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:40
+; GFX9-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:36
+; GFX9-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:32
+; GFX9-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:28
+; GFX9-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:24
+; GFX9-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:20
+; GFX9-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:16
+; GFX9-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:12
+; GFX9-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:8
+; GFX9-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_extload_v32bf16_to_v32f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x1f
+; GFX10-NEXT:    global_load_ushort v3, v[1:2], off
+; GFX10-NEXT:    global_load_ushort v4, v[1:2], off offset:2
+; GFX10-NEXT:    global_load_ushort v5, v[1:2], off offset:4
+; GFX10-NEXT:    global_load_ushort v6, v[1:2], off offset:6
+; GFX10-NEXT:    global_load_ushort v7, v[1:2], off offset:8
+; GFX10-NEXT:    global_load_ushort v8, v[1:2], off offset:10
+; GFX10-NEXT:    global_load_ushort v9, v[1:2], off offset:12
+; GFX10-NEXT:    global_load_ushort v10, v[1:2], off offset:14
+; GFX10-NEXT:    global_load_ushort v11, v[1:2], off offset:16
+; GFX10-NEXT:    global_load_ushort v12, v[1:2], off offset:18
+; GFX10-NEXT:    global_load_ushort v13, v[1:2], off offset:20
+; GFX10-NEXT:    global_load_ushort v14, v[1:2], off offset:22
+; GFX10-NEXT:    global_load_ushort v15, v[1:2], off offset:24
+; GFX10-NEXT:    global_load_ushort v16, v[1:2], off offset:26
+; GFX10-NEXT:    global_load_ushort v17, v[1:2], off offset:28
+; GFX10-NEXT:    global_load_ushort v18, v[1:2], off offset:30
+; GFX10-NEXT:    global_load_ushort v19, v[1:2], off offset:32
+; GFX10-NEXT:    global_load_ushort v20, v[1:2], off offset:34
+; GFX10-NEXT:    global_load_ushort v21, v[1:2], off offset:36
+; GFX10-NEXT:    global_load_ushort v22, v[1:2], off offset:38
+; GFX10-NEXT:    global_load_ushort v23, v[1:2], off offset:40
+; GFX10-NEXT:    global_load_ushort v24, v[1:2], off offset:42
+; GFX10-NEXT:    global_load_ushort v25, v[1:2], off offset:44
+; GFX10-NEXT:    global_load_ushort v26, v[1:2], off offset:46
+; GFX10-NEXT:    global_load_ushort v27, v[1:2], off offset:48
+; GFX10-NEXT:    global_load_ushort v28, v[1:2], off offset:62
+; GFX10-NEXT:    global_load_ushort v29, v[1:2], off offset:50
+; GFX10-NEXT:    global_load_ushort v30, v[1:2], off offset:52
+; GFX10-NEXT:    global_load_ushort v31, v[1:2], off offset:54
+; GFX10-NEXT:    global_load_ushort v32, v[1:2], off offset:60
+; GFX10-NEXT:    global_load_ushort v33, v[1:2], off offset:56
+; GFX10-NEXT:    global_load_ushort v34, v[1:2], off offset:58
+; GFX10-NEXT:    s_waitcnt vmcnt(31)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(30)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(29)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(28)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v6
+; GFX10-NEXT:    s_waitcnt vmcnt(27)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v7
+; GFX10-NEXT:    s_waitcnt vmcnt(26)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v8
+; GFX10-NEXT:    s_waitcnt vmcnt(25)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v9
+; GFX10-NEXT:    s_waitcnt vmcnt(24)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v10
+; GFX10-NEXT:    s_waitcnt vmcnt(23)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v11
+; GFX10-NEXT:    s_waitcnt vmcnt(22)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v12
+; GFX10-NEXT:    s_waitcnt vmcnt(21)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v13
+; GFX10-NEXT:    s_waitcnt vmcnt(20)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v14
+; GFX10-NEXT:    s_waitcnt vmcnt(19)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v15
+; GFX10-NEXT:    s_waitcnt vmcnt(18)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v16
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[11:12], v37
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[15:16], v38
+; GFX10-NEXT:    s_waitcnt vmcnt(15)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v19
+; GFX10-NEXT:    s_waitcnt vmcnt(14)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v20
+; GFX10-NEXT:    s_waitcnt vmcnt(13)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v21
+; GFX10-NEXT:    s_waitcnt vmcnt(12)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
+; GFX10-NEXT:    s_waitcnt vmcnt(11)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v70, 16, v23
+; GFX10-NEXT:    s_waitcnt vmcnt(10)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v71, 16, v24
+; GFX10-NEXT:    s_waitcnt vmcnt(9)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v80, 16, v25
+; GFX10-NEXT:    s_waitcnt vmcnt(8)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v81, 16, v26
+; GFX10-NEXT:    s_waitcnt vmcnt(7)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v82, 16, v27
+; GFX10-NEXT:    s_waitcnt vmcnt(6)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v28
+; GFX10-NEXT:    s_waitcnt vmcnt(5)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v83, 16, v29
+; GFX10-NEXT:    s_waitcnt vmcnt(4)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v84, 16, v30
+; GFX10-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v31
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v32
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v33
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[29:30], v29
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[5:6], v5
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[37:38], v84
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[13:14], v13
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[21:22], v21
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[25:26], v50
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[27:28], v51
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[50:51], v82
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[31:32], v52
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[33:34], v53
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[52:53], v80
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[7:8], v35
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[9:10], v36
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[19:20], v48
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[23:24], v49
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[35:36], v54
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[48:49], v55
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[54:55], v70
+; GFX10-NEXT:    v_lshlrev_b32_e32 v69, 16, v18
+; GFX10-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:252
+; GFX10-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:248
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[1:2], v83
+; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v17
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[3:4], v3
+; GFX10-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:244
+; GFX10-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:240
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[5:6], v81
+; GFX10-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:236
+; GFX10-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:232
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[13:14], v71
+; GFX10-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:228
+; GFX10-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:224
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[21:22], v65
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[64:65], v64
+; GFX10-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:220
+; GFX10-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:216
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[29:30], v67
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[66:67], v66
+; GFX10-NEXT:    buffer_store_dword v38, v0, s[0:3], 0 offen offset:212
+; GFX10-NEXT:    buffer_store_dword v37, v0, s[0:3], 0 offen offset:208
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[37:38], v69
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[17:18], v39
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[68:69], v68
+; GFX10-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:204
+; GFX10-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:200
+; GFX10-NEXT:    buffer_store_dword v51, v0, s[0:3], 0 offen offset:196
+; GFX10-NEXT:    buffer_store_dword v50, v0, s[0:3], 0 offen offset:192
+; GFX10-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:188
+; GFX10-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:184
+; GFX10-NEXT:    buffer_store_dword v53, v0, s[0:3], 0 offen offset:180
+; GFX10-NEXT:    buffer_store_dword v52, v0, s[0:3], 0 offen offset:176
+; GFX10-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:172
+; GFX10-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:168
+; GFX10-NEXT:    buffer_store_dword v55, v0, s[0:3], 0 offen offset:164
+; GFX10-NEXT:    buffer_store_dword v54, v0, s[0:3], 0 offen offset:160
+; GFX10-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:156
+; GFX10-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:152
+; GFX10-NEXT:    buffer_store_dword v65, v0, s[0:3], 0 offen offset:148
+; GFX10-NEXT:    buffer_store_dword v64, v0, s[0:3], 0 offen offset:144
+; GFX10-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:140
+; GFX10-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:136
+; GFX10-NEXT:    buffer_store_dword v67, v0, s[0:3], 0 offen offset:132
+; GFX10-NEXT:    buffer_store_dword v66, v0, s[0:3], 0 offen offset:128
+; GFX10-NEXT:    buffer_store_dword v38, v0, s[0:3], 0 offen offset:124
+; GFX10-NEXT:    buffer_store_dword v37, v0, s[0:3], 0 offen offset:120
+; GFX10-NEXT:    buffer_store_dword v69, v0, s[0:3], 0 offen offset:116
+; GFX10-NEXT:    buffer_store_dword v68, v0, s[0:3], 0 offen offset:112
+; GFX10-NEXT:    buffer_store_dword v49, v0, s[0:3], 0 offen offset:108
+; GFX10-NEXT:    buffer_store_dword v48, v0, s[0:3], 0 offen offset:104
+; GFX10-NEXT:    buffer_store_dword v36, v0, s[0:3], 0 offen offset:100
+; GFX10-NEXT:    buffer_store_dword v35, v0, s[0:3], 0 offen offset:96
+; GFX10-NEXT:    buffer_store_dword v34, v0, s[0:3], 0 offen offset:92
+; GFX10-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen offset:88
+; GFX10-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:84
+; GFX10-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:80
+; GFX10-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:76
+; GFX10-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:72
+; GFX10-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:68
+; GFX10-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:64
+; GFX10-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:60
+; GFX10-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:56
+; GFX10-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:52
+; GFX10-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:48
+; GFX10-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:44
+; GFX10-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:40
+; GFX10-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:36
+; GFX10-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:32
+; GFX10-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:28
+; GFX10-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:24
+; GFX10-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:20
+; GFX10-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:16
+; GFX10-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:12
+; GFX10-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:8
+; GFX10-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_extload_v32bf16_to_v32f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1f
+; GFX11-NEXT:    global_load_u16 v3, v[1:2], off offset:12
+; GFX11-NEXT:    global_load_u16 v4, v[1:2], off offset:8
+; GFX11-NEXT:    global_load_u16 v5, v[1:2], off offset:4
+; GFX11-NEXT:    global_load_u16 v6, v[1:2], off offset:2
+; GFX11-NEXT:    global_load_u16 v7, v[1:2], off
+; GFX11-NEXT:    global_load_u16 v8, v[1:2], off offset:6
+; GFX11-NEXT:    global_load_u16 v9, v[1:2], off offset:10
+; GFX11-NEXT:    global_load_u16 v10, v[1:2], off offset:14
+; GFX11-NEXT:    global_load_u16 v11, v[1:2], off offset:28
+; GFX11-NEXT:    global_load_u16 v12, v[1:2], off offset:24
+; GFX11-NEXT:    global_load_u16 v13, v[1:2], off offset:20
+; GFX11-NEXT:    global_load_u16 v14, v[1:2], off offset:18
+; GFX11-NEXT:    global_load_u16 v15, v[1:2], off offset:16
+; GFX11-NEXT:    global_load_u16 v16, v[1:2], off offset:22
+; GFX11-NEXT:    global_load_u16 v17, v[1:2], off offset:26
+; GFX11-NEXT:    global_load_u16 v18, v[1:2], off offset:30
+; GFX11-NEXT:    global_load_u16 v19, v[1:2], off offset:44
+; GFX11-NEXT:    global_load_u16 v20, v[1:2], off offset:40
+; GFX11-NEXT:    global_load_u16 v21, v[1:2], off offset:36
+; GFX11-NEXT:    global_load_u16 v22, v[1:2], off offset:34
+; GFX11-NEXT:    global_load_u16 v23, v[1:2], off offset:32
+; GFX11-NEXT:    global_load_u16 v24, v[1:2], off offset:38
+; GFX11-NEXT:    global_load_u16 v25, v[1:2], off offset:42
+; GFX11-NEXT:    global_load_u16 v26, v[1:2], off offset:46
+; GFX11-NEXT:    global_load_u16 v27, v[1:2], off offset:60
+; GFX11-NEXT:    global_load_u16 v28, v[1:2], off offset:56
+; GFX11-NEXT:    global_load_u16 v29, v[1:2], off offset:52
+; GFX11-NEXT:    global_load_u16 v30, v[1:2], off offset:50
+; GFX11-NEXT:    global_load_u16 v31, v[1:2], off offset:48
+; GFX11-NEXT:    global_load_u16 v32, v[1:2], off offset:54
+; GFX11-NEXT:    global_load_u16 v33, v[1:2], off offset:58
+; GFX11-NEXT:    global_load_u16 v1, v[1:2], off offset:62
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_add_i32 s1, s0, 0xf0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0xe0
+; GFX11-NEXT:    s_add_i32 s3, s0, 0xd0
+; GFX11-NEXT:    s_add_i32 s4, s0, 0xc0
+; GFX11-NEXT:    s_add_i32 s5, s0, 0xb0
+; GFX11-NEXT:    s_add_i32 s6, s0, 0xa0
+; GFX11-NEXT:    s_add_i32 s7, s0, 0x90
+; GFX11-NEXT:    s_add_i32 s8, s0, 0x70
+; GFX11-NEXT:    s_add_i32 s9, s0, 0x60
+; GFX11-NEXT:    s_add_i32 s10, s0, 0x50
+; GFX11-NEXT:    s_add_i32 s11, s0, 48
+; GFX11-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(30)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(28)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
+; GFX11-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v7
+; GFX11-NEXT:    s_waitcnt vmcnt(26)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
+; GFX11-NEXT:    s_waitcnt vmcnt(25)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    s_waitcnt vmcnt(24)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT:    s_waitcnt vmcnt(23)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v102, 16, v11
+; GFX11-NEXT:    s_waitcnt vmcnt(22)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v101, 16, v12
+; GFX11-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v100, 16, v15
+; GFX11-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v16
+; GFX11-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v19
+; GFX11-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v49, 16, v20
+; GFX11-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v103, 16, v23
+; GFX11-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v24
+; GFX11-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v27
+; GFX11-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v28
+; GFX11-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v31
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 16, v32
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v33
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[96:97], v68
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[84:85], v65
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[82:83], v64
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[86:87], v33
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[98:99], v1
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[80:81], v29
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[70:71], v30
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[68:69], v53
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[66:67], v26
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[64:65], v52
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[54:55], v25
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[52:53], v49
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[50:51], v48
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[48:49], v21
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[23:24], v34
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[35:36], v22
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[33:34], v103
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[31:32], v18
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[29:30], v102
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[27:28], v17
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[25:26], v101
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[21:22], v13
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[19:20], v14
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[17:18], v100
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[15:16], v10
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[13:14], v39
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[11:12], v9
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[9:10], v38
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[7:8], v6
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[5:6], v5
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[3:4], v2
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[1:2], v37
+; GFX11-NEXT:    scratch_store_b128 off, v[96:99], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[84:87], s2
+; GFX11-NEXT:    scratch_store_b128 off, v[80:83], s3
+; GFX11-NEXT:    scratch_store_b128 off, v[68:71], s4
+; GFX11-NEXT:    scratch_store_b128 off, v[64:67], s5
+; GFX11-NEXT:    scratch_store_b128 off, v[52:55], s6
+; GFX11-NEXT:    scratch_store_b128 off, v[48:51], s7
+; GFX11-NEXT:    scratch_store_b128 off, v[33:36], s0 offset:128
+; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s8
+; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s9
+; GFX11-NEXT:    scratch_store_b128 off, v[21:24], s10
+; GFX11-NEXT:    scratch_store_b128 off, v[17:20], s0 offset:64
+; GFX11-NEXT:    scratch_store_b128 off, v[13:16], s11
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    scratch_store_b128 off, v[9:12], s0 offset:32
+; GFX11-NEXT:    scratch_store_b128 off, v[5:8], s0 offset:16
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %load = load <32 x bfloat>, ptr addrspace(1) %ptr
+  %fpext = fpext <32 x bfloat> %load to <32 x double>
+  ret <32 x double> %fpext
+}
+
+define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fadd_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fadd_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fadd bfloat %a, %b
+  ret bfloat %op
+}
+
+define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_fadd_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v3
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v2
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fadd_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %op
+}
+
+define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+; GCN-LABEL: v_fadd_v3bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_add_f32_e32 v2, v2, v5
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v4
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v3
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_v3bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v4
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_v3bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_v3bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_v3bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fadd_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_add_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fadd <3 x bfloat> %a, %b
+  ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; GCN-LABEL: v_fadd_v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_add_f32_e32 v3, v3, v7
+; GCN-NEXT:    v_add_f32_e32 v2, v2, v6
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v5
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v4
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_add_f32_e32 v3, v3, v7
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_add_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    v_perm_b32 v1, v1, v4, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_add_f32_e32 v3, v5, v3
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_add_f32_e32 v3, v5, v4
+; GFX10-NEXT:    v_add_f32_e32 v4, v7, v6
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
+; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fadd_v4bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_add_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_add_f32 v3, v7, v6 :: v_dual_add_f32 v4, v5, v4
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x3020706
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fadd <4 x bfloat> %a, %b
+  ret <4 x bfloat> %op
+}
+
+define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; GCN-LABEL: v_fadd_v8bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_add_f32_e32 v7, v7, v15
+; GCN-NEXT:    v_add_f32_e32 v6, v6, v14
+; GCN-NEXT:    v_add_f32_e32 v5, v5, v13
+; GCN-NEXT:    v_add_f32_e32 v4, v4, v12
+; GCN-NEXT:    v_add_f32_e32 v3, v3, v11
+; GCN-NEXT:    v_add_f32_e32 v2, v2, v10
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v9
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v8
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_v8bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_add_f32_e32 v7, v7, v15
+; GFX7-NEXT:    v_add_f32_e32 v6, v6, v14
+; GFX7-NEXT:    v_add_f32_e32 v5, v5, v13
+; GFX7-NEXT:    v_add_f32_e32 v4, v4, v12
+; GFX7-NEXT:    v_add_f32_e32 v3, v3, v11
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v10
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v9
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v8
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_v8bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v7
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v6
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_add_f32_e32 v7, v9, v7
+; GFX8-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_add_f32_e32 v6, v9, v6
+; GFX8-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_add_f32_e32 v5, v9, v5
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v5, s4
+; GFX8-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX8-NEXT:    v_perm_b32 v2, v2, v7, s4
+; GFX8-NEXT:    v_perm_b32 v3, v3, v8, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_v8bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
+; GFX9-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
+; GFX9-NEXT:    v_add_f32_e32 v9, v10, v9
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v6
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
+; GFX9-NEXT:    v_add_f32_e32 v10, v11, v10
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v4
+; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v9
+; GFX9-NEXT:    v_add_f32_e32 v11, v12, v11
+; GFX9-NEXT:    v_add_f32_e32 v3, v3, v7
+; GFX9-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v8
+; GFX9-NEXT:    v_perm_b32 v0, v0, v11, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v10, s4
+; GFX9-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_v8bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v6
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v4
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v0
+; GFX10-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX10-NEXT:    v_add_f32_e32 v9, v11, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_add_f32_e32 v10, v13, v12
+; GFX10-NEXT:    v_add_f32_e32 v11, v15, v14
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v7
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v9
+; GFX10-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v8
+; GFX10-NEXT:    v_perm_b32 v0, v0, v11, 0x3020706
+; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_perm_b32 v2, v2, v10, 0x3020706
+; GFX10-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fadd_v8bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_add_f32 v8, v9, v8 :: v_dual_and_b32 v9, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    v_add_f32_e32 v3, v3, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add_f32_e32 v9, v10, v9
+; GFX11-NEXT:    v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v12, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v9
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX11-NEXT:    v_dual_add_f32 v10, v12, v11 :: v_dual_add_f32 v11, v14, v13
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX11-NEXT:    v_perm_b32 v2, v2, v10, 0x3020706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v11, 0x3020706
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fadd <8 x bfloat> %a, %b
+  ret <8 x bfloat> %op
+}
+
+define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
+; GCN-LABEL: v_fadd_v16bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT:    v_add_f32_e32 v14, v14, v30
+; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT:    v_add_f32_e32 v13, v13, v29
+; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT:    v_add_f32_e32 v12, v12, v28
+; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT:    v_add_f32_e32 v11, v11, v27
+; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT:    v_add_f32_e32 v10, v10, v26
+; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT:    v_add_f32_e32 v9, v9, v25
+; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT:    v_add_f32_e32 v8, v8, v24
+; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_add_f32_e32 v7, v7, v23
+; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_add_f32_e32 v6, v6, v22
+; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_add_f32_e32 v5, v5, v21
+; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_add_f32_e32 v4, v4, v20
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_add_f32_e32 v3, v3, v19
+; GCN-NEXT:    v_add_f32_e32 v2, v2, v18
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v17
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v16
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v20
+; GCN-NEXT:    v_add_f32_e32 v15, v15, v16
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_v16bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_add_f32_e32 v4, v4, v20
+; GFX7-NEXT:    buffer_load_dword v20, off, s[0:3], s32
+; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v16
+; GFX7-NEXT:    v_add_f32_e32 v14, v14, v30
+; GFX7-NEXT:    v_add_f32_e32 v13, v13, v29
+; GFX7-NEXT:    v_add_f32_e32 v12, v12, v28
+; GFX7-NEXT:    v_add_f32_e32 v11, v11, v27
+; GFX7-NEXT:    v_add_f32_e32 v10, v10, v26
+; GFX7-NEXT:    v_add_f32_e32 v9, v9, v25
+; GFX7-NEXT:    v_add_f32_e32 v8, v8, v24
+; GFX7-NEXT:    v_add_f32_e32 v7, v7, v23
+; GFX7-NEXT:    v_add_f32_e32 v6, v6, v22
+; GFX7-NEXT:    v_add_f32_e32 v5, v5, v21
+; GFX7-NEXT:    v_add_f32_e32 v3, v3, v19
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v18
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v17
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v20
+; GFX7-NEXT:    v_add_f32_e32 v15, v15, v16
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_v16bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX8-NEXT:    v_add_f32_e32 v7, v7, v15
+; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v14
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_add_f32_e32 v15, v17, v15
+; GFX8-NEXT:    v_add_f32_e32 v6, v6, v14
+; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v13
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_add_f32_e32 v14, v17, v14
+; GFX8-NEXT:    v_add_f32_e32 v5, v5, v13
+; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v12
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_add_f32_e32 v13, v17, v13
+; GFX8-NEXT:    v_add_f32_e32 v4, v4, v12
+; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v11
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_add_f32_e32 v12, v17, v12
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v11
+; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v10
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_add_f32_e32 v11, v17, v11
+; GFX8-NEXT:    v_add_f32_e32 v2, v2, v10
+; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v9
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_add_f32_e32 v10, v17, v10
+; GFX8-NEXT:    v_add_f32_e32 v1, v1, v9
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v8
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_add_f32_e32 v9, v17, v9
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v8
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v9, s4
+; GFX8-NEXT:    v_perm_b32 v1, v1, v10, s4
+; GFX8-NEXT:    v_perm_b32 v2, v2, v11, s4
+; GFX8-NEXT:    v_perm_b32 v3, v3, v12, s4
+; GFX8-NEXT:    v_perm_b32 v4, v4, v13, s4
+; GFX8-NEXT:    v_perm_b32 v5, v5, v14, s4
+; GFX8-NEXT:    v_perm_b32 v6, v6, v15, s4
+; GFX8-NEXT:    v_perm_b32 v7, v7, v16, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_v16bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
+; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v14
+; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
+; GFX9-NEXT:    v_add_f32_e32 v17, v18, v17
+; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v13
+; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v5
+; GFX9-NEXT:    v_add_f32_e32 v18, v19, v18
+; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v12
+; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v4
+; GFX9-NEXT:    v_add_f32_e32 v19, v20, v19
+; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v11
+; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v3
+; GFX9-NEXT:    v_add_f32_e32 v20, v21, v20
+; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v10
+; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v2
+; GFX9-NEXT:    v_add_f32_e32 v21, v22, v21
+; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v9
+; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v1
+; GFX9-NEXT:    v_add_f32_e32 v22, v23, v22
+; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v8
+; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_add_f32_e32 v23, v24, v23
+; GFX9-NEXT:    v_add_f32_e32 v7, v7, v15
+; GFX9-NEXT:    v_add_f32_e32 v6, v6, v14
+; GFX9-NEXT:    v_add_f32_e32 v5, v5, v13
+; GFX9-NEXT:    v_add_f32_e32 v4, v4, v12
+; GFX9-NEXT:    v_add_f32_e32 v3, v3, v11
+; GFX9-NEXT:    v_add_f32_e32 v2, v2, v10
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v9
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v8
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_perm_b32 v0, v0, v23, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v22, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v21, s4
+; GFX9-NEXT:    v_perm_b32 v3, v3, v20, s4
+; GFX9-NEXT:    v_perm_b32 v4, v4, v19, s4
+; GFX9-NEXT:    v_perm_b32 v5, v5, v18, s4
+; GFX9-NEXT:    v_perm_b32 v6, v6, v17, s4
+; GFX9-NEXT:    v_perm_b32 v7, v7, v16, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_v16bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
+; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
+; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
+; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v13
+; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
+; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
+; GFX10-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v14
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
+; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v11
+; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v10
+; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v2
+; GFX10-NEXT:    v_add_f32_e32 v17, v18, v17
+; GFX10-NEXT:    v_add_f32_e32 v18, v20, v19
+; GFX10-NEXT:    v_add_f32_e32 v19, v22, v21
+; GFX10-NEXT:    v_add_f32_e32 v20, v24, v23
+; GFX10-NEXT:    v_add_f32_e32 v21, v26, v25
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v9
+; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v8
+; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_add_f32_e32 v22, v23, v22
+; GFX10-NEXT:    v_add_f32_e32 v23, v25, v24
+; GFX10-NEXT:    v_add_f32_e32 v7, v7, v15
+; GFX10-NEXT:    v_add_f32_e32 v6, v6, v14
+; GFX10-NEXT:    v_add_f32_e32 v5, v5, v13
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v8
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v9
+; GFX10-NEXT:    v_add_f32_e32 v2, v2, v10
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v11
+; GFX10-NEXT:    v_add_f32_e32 v4, v4, v12
+; GFX10-NEXT:    v_perm_b32 v0, v0, v23, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v1, v1, v22, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v2, v2, v21, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v3, v3, v20, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v4, v4, v19, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v5, v5, v18, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v6, v6, v17, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v7, v7, v16, 0x3020706
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fadd_v16bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
+; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
+; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v11
+; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v13
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
+; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_dual_add_f32 v16, v17, v16 :: v_dual_and_b32 v17, 0xffff0000, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_add_f32 v4, v4, v12 :: v_dual_add_f32 v5, v5, v13
+; GFX11-NEXT:    v_dual_add_f32 v17, v18, v17 :: v_dual_add_f32 v18, v20, v19
+; GFX11-NEXT:    v_add_f32_e32 v19, v22, v21
+; GFX11-NEXT:    v_add_f32_e32 v7, v7, v15
+; GFX11-NEXT:    v_add_f32_e32 v21, v26, v25
+; GFX11-NEXT:    v_dual_add_f32 v6, v6, v14 :: v_dual_and_b32 v25, 0xffff0000, v0
+; GFX11-NEXT:    v_add_f32_e32 v20, v24, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT:    v_dual_add_f32 v2, v2, v10 :: v_dual_add_f32 v3, v3, v11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_add_f32 v1, v1, v9 :: v_dual_add_f32 v22, v23, v22
+; GFX11-NEXT:    v_add_f32_e32 v23, v25, v24
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v8
+; GFX11-NEXT:    v_perm_b32 v2, v2, v21, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v3, v3, v20, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v1, v1, v22, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v4, v4, v19, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v0, v0, v23, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v5, v5, v18, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v6, v6, v17, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fadd <16 x bfloat> %a, %b
+  ret <16 x bfloat> %op
+}
+
+define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
+; GCN-LABEL: v_fadd_v32bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:124
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    v_add_f32_e32 v31, v32, v31
+; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:120
+; GCN-NEXT:    v_add_f32_e32 v30, v30, v32
+; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v29, v29, v33
+; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:112
+; GCN-NEXT:    v_add_f32_e32 v28, v28, v32
+; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v27, v27, v33
+; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:104
+; GCN-NEXT:    v_add_f32_e32 v26, v26, v32
+; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v25, v25, v33
+; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:96
+; GCN-NEXT:    v_add_f32_e32 v24, v24, v32
+; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v23, v23, v33
+; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:88
+; GCN-NEXT:    v_add_f32_e32 v22, v22, v32
+; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v21, v21, v33
+; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:80
+; GCN-NEXT:    v_add_f32_e32 v20, v20, v32
+; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v19, v19, v33
+; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:72
+; GCN-NEXT:    v_add_f32_e32 v18, v18, v32
+; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v17, v17, v33
+; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:64
+; GCN-NEXT:    v_add_f32_e32 v16, v16, v32
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v15, v15, v33
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:56
+; GCN-NEXT:    v_add_f32_e32 v14, v14, v32
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v13, v13, v33
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:48
+; GCN-NEXT:    v_add_f32_e32 v12, v12, v32
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v11, v11, v33
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:40
+; GCN-NEXT:    v_add_f32_e32 v10, v10, v32
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v9, v9, v33
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:32
+; GCN-NEXT:    v_add_f32_e32 v8, v8, v32
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v7, v7, v33
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:24
+; GCN-NEXT:    v_add_f32_e32 v6, v6, v32
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v5, v5, v33
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:16
+; GCN-NEXT:    v_add_f32_e32 v4, v4, v32
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v3, v3, v33
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GCN-NEXT:    v_add_f32_e32 v2, v2, v32
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v33
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v32
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_v32bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v31, v32, v31
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v30, v30, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:120
+; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v29, v29, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v28, v28, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:112
+; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v27, v27, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v26, v26, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:104
+; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v25, v25, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v24, v24, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:96
+; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v23, v23, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v22, v22, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
+; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v21, v21, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v20, v20, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:80
+; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v19, v19, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v18, v18, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
+; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v17, v17, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v16, v16, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v15, v15, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v14, v14, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v13, v13, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v12, v12, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:48
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v11, v11, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v10, v10, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:40
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v9, v9, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v8, v8, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:32
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v7, v7, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v6, v6, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v5, v5, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v4, v4, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:16
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v3, v3, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v32
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_v32bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v31, 0xffff0000, v30
+; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_add_f32_e32 v31, v32, v31
+; GFX8-NEXT:    v_add_f32_e32 v14, v14, v30
+; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v29
+; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_add_f32_e32 v30, v32, v30
+; GFX8-NEXT:    v_add_f32_e32 v13, v13, v29
+; GFX8-NEXT:    v_and_b32_e32 v29, 0xffff0000, v28
+; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_add_f32_e32 v29, v32, v29
+; GFX8-NEXT:    v_add_f32_e32 v12, v12, v28
+; GFX8-NEXT:    v_and_b32_e32 v28, 0xffff0000, v27
+; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_add_f32_e32 v28, v32, v28
+; GFX8-NEXT:    v_add_f32_e32 v11, v11, v27
+; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v26
+; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_add_f32_e32 v27, v32, v27
+; GFX8-NEXT:    v_add_f32_e32 v10, v10, v26
+; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v25
+; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_add_f32_e32 v26, v32, v26
+; GFX8-NEXT:    v_add_f32_e32 v9, v9, v25
+; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v24
+; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_add_f32_e32 v8, v8, v24
+; GFX8-NEXT:    buffer_load_dword v24, off, s[0:3], s32
+; GFX8-NEXT:    v_add_f32_e32 v25, v32, v25
+; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v8, v8, v25, s4
+; GFX8-NEXT:    v_perm_b32 v9, v9, v26, s4
+; GFX8-NEXT:    v_perm_b32 v10, v10, v27, s4
+; GFX8-NEXT:    v_perm_b32 v11, v11, v28, s4
+; GFX8-NEXT:    v_perm_b32 v12, v12, v29, s4
+; GFX8-NEXT:    v_perm_b32 v13, v13, v30, s4
+; GFX8-NEXT:    v_perm_b32 v14, v14, v31, s4
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX8-NEXT:    v_add_f32_e32 v32, v32, v33
+; GFX8-NEXT:    v_add_f32_e32 v15, v15, v24
+; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v23
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_add_f32_e32 v24, v33, v24
+; GFX8-NEXT:    v_add_f32_e32 v7, v7, v23
+; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v22
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_add_f32_e32 v23, v33, v23
+; GFX8-NEXT:    v_add_f32_e32 v6, v6, v22
+; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v21
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_add_f32_e32 v22, v33, v22
+; GFX8-NEXT:    v_add_f32_e32 v5, v5, v21
+; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v20
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_add_f32_e32 v21, v33, v21
+; GFX8-NEXT:    v_add_f32_e32 v4, v4, v20
+; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v19
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_add_f32_e32 v20, v33, v20
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v19
+; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v18
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_add_f32_e32 v19, v33, v19
+; GFX8-NEXT:    v_add_f32_e32 v2, v2, v18
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v17
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_add_f32_e32 v18, v33, v18
+; GFX8-NEXT:    v_add_f32_e32 v1, v1, v17
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v16
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_add_f32_e32 v17, v33, v17
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v16
+; GFX8-NEXT:    v_perm_b32 v0, v0, v17, s4
+; GFX8-NEXT:    v_perm_b32 v1, v1, v18, s4
+; GFX8-NEXT:    v_perm_b32 v2, v2, v19, s4
+; GFX8-NEXT:    v_perm_b32 v3, v3, v20, s4
+; GFX8-NEXT:    v_perm_b32 v4, v4, v21, s4
+; GFX8-NEXT:    v_perm_b32 v5, v5, v22, s4
+; GFX8-NEXT:    v_perm_b32 v6, v6, v23, s4
+; GFX8-NEXT:    v_perm_b32 v7, v7, v24, s4
+; GFX8-NEXT:    v_perm_b32 v15, v15, v32, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_v32bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
+; GFX9-NEXT:    v_and_b32_e32 v39, 0xffff0000, v11
+; GFX9-NEXT:    v_and_b32_e32 v48, 0xffff0000, v26
+; GFX9-NEXT:    v_and_b32_e32 v49, 0xffff0000, v10
+; GFX9-NEXT:    v_and_b32_e32 v50, 0xffff0000, v25
+; GFX9-NEXT:    v_and_b32_e32 v51, 0xffff0000, v9
+; GFX9-NEXT:    v_and_b32_e32 v40, 0xffff0000, v22
+; GFX9-NEXT:    v_and_b32_e32 v41, 0xffff0000, v6
+; GFX9-NEXT:    v_and_b32_e32 v58, 0xffff0000, v17
+; GFX9-NEXT:    v_and_b32_e32 v59, 0xffff0000, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v52, 0xffff0000, v24
+; GFX9-NEXT:    v_and_b32_e32 v53, 0xffff0000, v8
+; GFX9-NEXT:    v_and_b32_e32 v54, 0xffff0000, v23
+; GFX9-NEXT:    v_and_b32_e32 v55, 0xffff0000, v7
+; GFX9-NEXT:    v_and_b32_e32 v42, 0xffff0000, v21
+; GFX9-NEXT:    v_and_b32_e32 v43, 0xffff0000, v5
+; GFX9-NEXT:    v_and_b32_e32 v44, 0xffff0000, v20
+; GFX9-NEXT:    v_and_b32_e32 v45, 0xffff0000, v4
+; GFX9-NEXT:    v_and_b32_e32 v46, 0xffff0000, v19
+; GFX9-NEXT:    v_and_b32_e32 v47, 0xffff0000, v3
+; GFX9-NEXT:    v_and_b32_e32 v56, 0xffff0000, v18
+; GFX9-NEXT:    v_and_b32_e32 v57, 0xffff0000, v2
+; GFX9-NEXT:    v_add_f32_e32 v38, v39, v38
+; GFX9-NEXT:    v_add_f32_e32 v39, v49, v48
+; GFX9-NEXT:    v_add_f32_e32 v48, v51, v50
+; GFX9-NEXT:    v_add_f32_e32 v51, v41, v40
+; GFX9-NEXT:    v_add_f32_e32 v40, v59, v58
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v17
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_add_f32_e32 v49, v53, v52
+; GFX9-NEXT:    v_add_f32_e32 v50, v55, v54
+; GFX9-NEXT:    v_add_f32_e32 v52, v43, v42
+; GFX9-NEXT:    v_add_f32_e32 v53, v45, v44
+; GFX9-NEXT:    v_add_f32_e32 v54, v47, v46
+; GFX9-NEXT:    v_add_f32_e32 v55, v57, v56
+; GFX9-NEXT:    v_perm_b32 v1, v1, v40, s4
+; GFX9-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    v_and_b32_e32 v32, 0xffff0000, v30
+; GFX9-NEXT:    v_and_b32_e32 v33, 0xffff0000, v14
+; GFX9-NEXT:    v_and_b32_e32 v34, 0xffff0000, v29
+; GFX9-NEXT:    v_and_b32_e32 v35, 0xffff0000, v13
+; GFX9-NEXT:    v_and_b32_e32 v36, 0xffff0000, v28
+; GFX9-NEXT:    v_and_b32_e32 v37, 0xffff0000, v12
+; GFX9-NEXT:    v_add_f32_e32 v32, v33, v32
+; GFX9-NEXT:    v_and_b32_e32 v33, 0xffff0000, v16
+; GFX9-NEXT:    v_add_f32_e32 v34, v35, v34
+; GFX9-NEXT:    v_and_b32_e32 v35, 0xffff0000, v0
+; GFX9-NEXT:    v_add_f32_e32 v36, v37, v36
+; GFX9-NEXT:    v_and_b32_e32 v37, 0xffff0000, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT:    v_add_f32_e32 v33, v35, v33
+; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_add_f32_e32 v14, v14, v30
+; GFX9-NEXT:    v_add_f32_e32 v13, v13, v29
+; GFX9-NEXT:    v_add_f32_e32 v12, v12, v28
+; GFX9-NEXT:    v_add_f32_e32 v11, v11, v27
+; GFX9-NEXT:    v_add_f32_e32 v10, v10, v26
+; GFX9-NEXT:    v_add_f32_e32 v9, v9, v25
+; GFX9-NEXT:    v_add_f32_e32 v8, v8, v24
+; GFX9-NEXT:    v_add_f32_e32 v7, v7, v23
+; GFX9-NEXT:    v_add_f32_e32 v6, v6, v22
+; GFX9-NEXT:    v_add_f32_e32 v5, v5, v21
+; GFX9-NEXT:    v_add_f32_e32 v4, v4, v20
+; GFX9-NEXT:    v_add_f32_e32 v3, v3, v19
+; GFX9-NEXT:    v_add_f32_e32 v2, v2, v18
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v16
+; GFX9-NEXT:    v_perm_b32 v0, v0, v33, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v55, s4
+; GFX9-NEXT:    v_perm_b32 v3, v3, v54, s4
+; GFX9-NEXT:    v_perm_b32 v4, v4, v53, s4
+; GFX9-NEXT:    v_perm_b32 v5, v5, v52, s4
+; GFX9-NEXT:    v_perm_b32 v6, v6, v51, s4
+; GFX9-NEXT:    v_perm_b32 v7, v7, v50, s4
+; GFX9-NEXT:    v_perm_b32 v8, v8, v49, s4
+; GFX9-NEXT:    v_perm_b32 v9, v9, v48, s4
+; GFX9-NEXT:    v_perm_b32 v10, v10, v39, s4
+; GFX9-NEXT:    v_perm_b32 v11, v11, v38, s4
+; GFX9-NEXT:    v_perm_b32 v12, v12, v36, s4
+; GFX9-NEXT:    v_perm_b32 v13, v13, v34, s4
+; GFX9-NEXT:    v_perm_b32 v14, v14, v32, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v35, 0xffff0000, v31
+; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
+; GFX9-NEXT:    v_add_f32_e32 v35, v37, v35
+; GFX9-NEXT:    v_add_f32_e32 v15, v15, v31
+; GFX9-NEXT:    v_perm_b32 v15, v15, v35, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_v32bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT:    v_and_b32_e32 v53, 0xffff0000, v24
+; GFX10-NEXT:    v_and_b32_e32 v54, 0xffff0000, v8
+; GFX10-NEXT:    v_and_b32_e32 v55, 0xffff0000, v23
+; GFX10-NEXT:    v_and_b32_e32 v64, 0xffff0000, v7
+; GFX10-NEXT:    v_and_b32_e32 v65, 0xffff0000, v22
+; GFX10-NEXT:    v_and_b32_e32 v66, 0xffff0000, v6
+; GFX10-NEXT:    v_and_b32_e32 v67, 0xffff0000, v21
+; GFX10-NEXT:    v_and_b32_e32 v68, 0xffff0000, v5
+; GFX10-NEXT:    v_and_b32_e32 v33, 0xffff0000, v30
+; GFX10-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
+; GFX10-NEXT:    v_and_b32_e32 v35, 0xffff0000, v29
+; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v13
+; GFX10-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
+; GFX10-NEXT:    v_and_b32_e32 v38, 0xffff0000, v12
+; GFX10-NEXT:    v_and_b32_e32 v39, 0xffff0000, v27
+; GFX10-NEXT:    v_and_b32_e32 v48, 0xffff0000, v11
+; GFX10-NEXT:    v_and_b32_e32 v49, 0xffff0000, v26
+; GFX10-NEXT:    v_and_b32_e32 v50, 0xffff0000, v10
+; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v25
+; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v9
+; GFX10-NEXT:    v_add_f32_e32 v53, v54, v53
+; GFX10-NEXT:    v_and_b32_e32 v54, 0xffff0000, v17
+; GFX10-NEXT:    v_add_f32_e32 v55, v64, v55
+; GFX10-NEXT:    v_and_b32_e32 v64, 0xffff0000, v1
+; GFX10-NEXT:    v_add_f32_e32 v65, v66, v65
+; GFX10-NEXT:    v_and_b32_e32 v66, 0xffff0000, v16
+; GFX10-NEXT:    v_add_f32_e32 v67, v68, v67
+; GFX10-NEXT:    v_and_b32_e32 v68, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
+; GFX10-NEXT:    v_add_f32_e32 v33, v34, v33
+; GFX10-NEXT:    v_and_b32_e32 v34, 0xffff0000, v20
+; GFX10-NEXT:    v_add_f32_e32 v35, v36, v35
+; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v4
+; GFX10-NEXT:    v_add_f32_e32 v37, v38, v37
+; GFX10-NEXT:    v_and_b32_e32 v38, 0xffff0000, v19
+; GFX10-NEXT:    v_add_f32_e32 v39, v48, v39
+; GFX10-NEXT:    v_and_b32_e32 v48, 0xffff0000, v3
+; GFX10-NEXT:    v_add_f32_e32 v49, v50, v49
+; GFX10-NEXT:    v_and_b32_e32 v50, 0xffff0000, v18
+; GFX10-NEXT:    v_add_f32_e32 v51, v52, v51
+; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v16
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v17
+; GFX10-NEXT:    v_add_f32_e32 v34, v36, v34
+; GFX10-NEXT:    v_add_f32_e32 v36, v48, v38
+; GFX10-NEXT:    v_add_f32_e32 v38, v52, v50
+; GFX10-NEXT:    v_add_f32_e32 v48, v64, v54
+; GFX10-NEXT:    v_add_f32_e32 v50, v68, v66
+; GFX10-NEXT:    v_add_f32_e32 v14, v14, v30
+; GFX10-NEXT:    v_add_f32_e32 v13, v13, v29
+; GFX10-NEXT:    v_add_f32_e32 v12, v12, v28
+; GFX10-NEXT:    v_add_f32_e32 v11, v11, v27
+; GFX10-NEXT:    v_add_f32_e32 v10, v10, v26
+; GFX10-NEXT:    v_add_f32_e32 v9, v9, v25
+; GFX10-NEXT:    v_add_f32_e32 v8, v8, v24
+; GFX10-NEXT:    v_add_f32_e32 v7, v7, v23
+; GFX10-NEXT:    v_add_f32_e32 v6, v6, v22
+; GFX10-NEXT:    v_add_f32_e32 v5, v5, v21
+; GFX10-NEXT:    v_add_f32_e32 v2, v2, v18
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v19
+; GFX10-NEXT:    v_add_f32_e32 v4, v4, v20
+; GFX10-NEXT:    v_perm_b32 v0, v0, v50, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v1, v1, v48, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v2, v2, v38, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v3, v3, v36, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v4, v4, v34, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v5, v5, v67, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v6, v6, v65, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v7, v7, v55, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v8, v8, v53, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v9, v9, v51, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v10, v10, v49, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v11, v11, v39, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v12, v12, v37, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v13, v13, v35, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v14, v14, v33, 0x3020706
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v31
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v31
+; GFX10-NEXT:    v_add_f32_e32 v16, v32, v16
+; GFX10-NEXT:    v_add_f32_e32 v15, v15, v17
+; GFX10-NEXT:    v_perm_b32 v15, v15, v16, 0x3020706
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fadd_v32bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    v_and_b32_e32 v82, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v84, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v85, 0xffff0000, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v86, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v81, 0xffff0000, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-NEXT:    v_and_b32_e32 v83, 0xffff0000, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT:    v_and_b32_e32 v55, 0xffff0000, v23
+; GFX11-NEXT:    v_and_b32_e32 v70, 0xffff0000, v4
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v18
+; GFX11-NEXT:    v_and_b32_e32 v65, 0xffff0000, v22
+; GFX11-NEXT:    v_dual_add_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v22, 16, v22
+; GFX11-NEXT:    v_and_b32_e32 v66, 0xffff0000, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    v_and_b32_e32 v50, 0xffff0000, v10
+; GFX11-NEXT:    v_and_b32_e32 v69, 0xffff0000, v20
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_add_f32 v6, v6, v22 :: v_dual_lshlrev_b32 v23, 16, v23
+; GFX11-NEXT:    v_and_b32_e32 v54, 0xffff0000, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT:    v_and_b32_e32 v71, 0xffff0000, v19
+; GFX11-NEXT:    v_dual_add_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v19, 16, v19
+; GFX11-NEXT:    v_and_b32_e32 v64, 0xffff0000, v7
+; GFX11-NEXT:    v_and_b32_e32 v49, 0xffff0000, v26
+; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v52, 0xffff0000, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_and_b32_e32 v80, 0xffff0000, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_dual_add_f32 v10, v10, v26 :: v_dual_and_b32 v67, 0xffff0000, v21
+; GFX11-NEXT:    v_and_b32_e32 v68, 0xffff0000, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_add_f32 v3, v3, v19 :: v_dual_and_b32 v38, 0xffff0000, v12
+; GFX11-NEXT:    v_and_b32_e32 v51, 0xffff0000, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT:    v_add_f32_e32 v7, v7, v23
+; GFX11-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
+; GFX11-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v39, 0xffff0000, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-NEXT:    v_add_f32_e32 v9, v9, v25
+; GFX11-NEXT:    v_and_b32_e32 v53, 0xffff0000, v24
+; GFX11-NEXT:    v_and_b32_e32 v48, 0xffff0000, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff0000, v29
+; GFX11-NEXT:    v_and_b32_e32 v36, 0xffff0000, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_add_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v4, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff0000, v30
+; GFX11-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
+; GFX11-NEXT:    v_dual_add_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v15, 16, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    v_dual_add_f32 v33, v34, v33 :: v_dual_add_f32 v34, v36, v35
+; GFX11-NEXT:    v_dual_add_f32 v35, v38, v37 :: v_dual_add_f32 v12, v12, v28
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_add_f32 v8, v8, v24 :: v_dual_add_f32 v5, v5, v21
+; GFX11-NEXT:    v_perm_b32 v12, v12, v35, 0x3020706
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v31
+; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v31
+; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_add_f32 v16, v32, v16 :: v_dual_add_f32 v13, v13, v29
+; GFX11-NEXT:    v_dual_add_f32 v15, v15, v17 :: v_dual_add_f32 v14, v14, v30
+; GFX11-NEXT:    v_add_f32_e32 v36, v48, v39
+; GFX11-NEXT:    v_dual_add_f32 v48, v64, v55 :: v_dual_add_f32 v37, v50, v49
+; GFX11-NEXT:    v_add_f32_e32 v50, v68, v67
+; GFX11-NEXT:    v_dual_add_f32 v38, v52, v51 :: v_dual_add_f32 v51, v70, v69
+; GFX11-NEXT:    v_dual_add_f32 v52, v80, v71 :: v_dual_add_f32 v39, v54, v53
+; GFX11-NEXT:    v_dual_add_f32 v53, v82, v81 :: v_dual_add_f32 v54, v84, v83
+; GFX11-NEXT:    v_add_f32_e32 v55, v86, v85
+; GFX11-NEXT:    v_add_f32_e32 v49, v66, v65
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v3, v3, v52, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v2, v2, v53, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v1, v1, v54, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v0, v0, v55, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v4, v4, v51, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v5, v5, v50, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v6, v6, v49, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v7, v7, v48, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v8, v8, v39, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v9, v9, v38, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v10, v10, v37, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v11, v11, v36, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v13, v13, v34, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v14, v14, v33, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v15, v15, v16, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fadd <32 x bfloat> %a, %b
+  ret <32 x bfloat> %op
+}
+
+define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
+; GCN-LABEL: v_fadd_bf16_fpimm_0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_bf16_fpimm_0:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_bf16_fpimm_0:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_bf16_fpimm_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_bf16_fpimm_0:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fadd_bf16_fpimm_0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %add = fadd bfloat %arg0, 1.0
+  ret bfloat %add
+}
+
+define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
+; GCN-LABEL: v_fadd_bf16_fpimm_1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_add_f32_e32 v0, 0x42280000, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_bf16_fpimm_1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_add_f32_e32 v0, 0x42280000, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_bf16_fpimm_1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_f32_e32 v0, 0x42280000, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_bf16_fpimm_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_add_f32_e32 v0, 0x42280000, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_bf16_fpimm_1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_add_f32_e32 v0, 0x42280000, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fadd_bf16_fpimm_1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v0, 0x42280000, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %add = fadd bfloat %arg0, 42.0
+  ret bfloat %add
+}
+
+define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fsub_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fsub_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fsub_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fsub_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fsub_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fsub_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fsub bfloat %a, %b
+  ret bfloat %op
+}
+
+define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_fsub_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GCN-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fsub_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fsub_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_sub_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fsub_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_sub_f32_e32 v2, v3, v2
+; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fsub_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_sub_f32_e32 v2, v3, v2
+; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fsub_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_sub_f32_e32 v2, v3, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %op
+}
+
+define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+; GCN-LABEL: v_fsub_v3bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_sub_f32_e32 v2, v2, v5
+; GCN-NEXT:    v_sub_f32_e32 v1, v1, v4
+; GCN-NEXT:    v_sub_f32_e32 v0, v0, v3
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fsub_v3bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v5
+; GFX7-NEXT:    v_sub_f32_e32 v1, v1, v4
+; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fsub_v3bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_sub_f32_e32 v3, v4, v3
+; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_extload_v32bf16_to_v32f64:
+; GFX9-LABEL: v_fsub_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v21, v[1:2], off offset:62
-; GFX9-NEXT:    global_load_ushort v23, v[1:2], off offset:60
-; GFX9-NEXT:    global_load_ushort v24, v[1:2], off offset:58
-; GFX9-NEXT:    global_load_ushort v25, v[1:2], off offset:56
-; GFX9-NEXT:    global_load_ushort v26, v[1:2], off offset:54
-; GFX9-NEXT:    global_load_ushort v27, v[1:2], off offset:52
-; GFX9-NEXT:    global_load_ushort v28, v[1:2], off offset:50
-; GFX9-NEXT:    global_load_ushort v29, v[1:2], off offset:48
-; GFX9-NEXT:    global_load_ushort v30, v[1:2], off offset:46
-; GFX9-NEXT:    global_load_ushort v31, v[1:2], off offset:44
-; GFX9-NEXT:    global_load_ushort v32, v[1:2], off offset:42
-; GFX9-NEXT:    global_load_ushort v33, v[1:2], off offset:40
-; GFX9-NEXT:    global_load_ushort v34, v[1:2], off offset:38
-; GFX9-NEXT:    global_load_ushort v19, v[1:2], off
-; GFX9-NEXT:    global_load_ushort v20, v[1:2], off offset:36
-; GFX9-NEXT:    global_load_ushort v17, v[1:2], off offset:2
-; GFX9-NEXT:    global_load_ushort v18, v[1:2], off offset:4
-; GFX9-NEXT:    global_load_ushort v16, v[1:2], off offset:34
-; GFX9-NEXT:    global_load_ushort v11, v[1:2], off offset:32
-; GFX9-NEXT:    global_load_ushort v13, v[1:2], off offset:6
-; GFX9-NEXT:    global_load_ushort v14, v[1:2], off offset:8
-; GFX9-NEXT:    global_load_ushort v15, v[1:2], off offset:30
-; GFX9-NEXT:    global_load_ushort v3, v[1:2], off offset:16
-; GFX9-NEXT:    global_load_ushort v4, v[1:2], off offset:18
-; GFX9-NEXT:    global_load_ushort v5, v[1:2], off offset:20
-; GFX9-NEXT:    global_load_ushort v6, v[1:2], off offset:22
-; GFX9-NEXT:    global_load_ushort v8, v[1:2], off offset:24
-; GFX9-NEXT:    global_load_ushort v10, v[1:2], off offset:26
-; GFX9-NEXT:    global_load_ushort v12, v[1:2], off offset:28
-; GFX9-NEXT:    global_load_ushort v9, v[1:2], off offset:10
-; GFX9-NEXT:    global_load_ushort v7, v[1:2], off offset:12
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    global_load_ushort v1, v[1:2], off offset:14
-; GFX9-NEXT:    s_waitcnt vmcnt(31)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v21
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[21:22], v2
-; GFX9-NEXT:    s_waitcnt vmcnt(30)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v23
-; GFX9-NEXT:    s_waitcnt vmcnt(28)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v25
-; GFX9-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:252
-; GFX9-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:248
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[21:22], v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v24
-; GFX9-NEXT:    s_waitcnt vmcnt(29)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v26
-; GFX9-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:244
-; GFX9-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:240
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[21:22], v2
-; GFX9-NEXT:    s_waitcnt vmcnt(30)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v27
-; GFX9-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:236
-; GFX9-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:232
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[21:22], v23
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[23:24], v24
-; GFX9-NEXT:    s_waitcnt vmcnt(31)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v28
-; GFX9-NEXT:    s_waitcnt vmcnt(30)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v29
-; GFX9-NEXT:    s_waitcnt vmcnt(29)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v30
-; GFX9-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:228
-; GFX9-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:224
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[21:22], v25
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[25:26], v26
-; GFX9-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:220
-; GFX9-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:216
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[23:24], v27
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[27:28], v2
-; GFX9-NEXT:    s_waitcnt vmcnt(28)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v19
-; GFX9-NEXT:    s_waitcnt vmcnt(27)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[19:20], v19
-; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v31
-; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v32
-; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v33
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v34
-; GFX9-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:212
-; GFX9-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:208
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[21:22], v29
-; GFX9-NEXT:    s_waitcnt vmcnt(26)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[29:30], v30
-; GFX9-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:204
-; GFX9-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:200
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[25:26], v31
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[31:32], v32
-; GFX9-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:196
-; GFX9-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:192
-; GFX9-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:188
-; GFX9-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:184
-; GFX9-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:180
-; GFX9-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:176
-; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:172
-; GFX9-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:168
-; GFX9-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:164
-; GFX9-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:160
-; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:156
-; GFX9-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:152
-; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v17
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
-; GFX9-NEXT:    s_waitcnt vmcnt(39)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX9-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:148
-; GFX9-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:144
-; GFX9-NEXT:    s_waitcnt vmcnt(40)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v13
-; GFX9-NEXT:    s_waitcnt vmcnt(39)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v14
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[13:14], v11
-; GFX9-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:140
-; GFX9-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:136
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[16:17], v2
-; GFX9-NEXT:    s_waitcnt vmcnt(40)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v15
-; GFX9-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:132
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[14:15], v2
-; GFX9-NEXT:    s_waitcnt vmcnt(34)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v12
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[11:12], v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v10
-; GFX9-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:128
-; GFX9-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:124
-; GFX9-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:120
-; GFX9-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:116
-; GFX9-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:112
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[10:11], v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX9-NEXT:    s_waitcnt vmcnt(38)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v9
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
-; GFX9-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:108
-; GFX9-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:104
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[10:11], v2
-; GFX9-NEXT:    s_waitcnt vmcnt(39)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-NEXT:    s_waitcnt vmcnt(38)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v5
-; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:100
-; GFX9-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:96
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v2
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[3:4], v3
-; GFX9-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:92
-; GFX9-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:88
-; GFX9-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:84
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:80
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[1:2], v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v18
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[18:19], v21
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[13:14], v22
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[20:21], v20
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[22:23], v23
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[5:6], v12
-; GFX9-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:76
-; GFX9-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:72
-; GFX9-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:68
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:64
-; GFX9-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:56
-; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:52
-; GFX9-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:48
-; GFX9-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:44
-; GFX9-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:40
-; GFX9-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:36
-; GFX9-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:32
-; GFX9-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:28
-; GFX9-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:24
-; GFX9-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:20
-; GFX9-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:12
-; GFX9-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:8
-; GFX9-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:4
-; GFX9-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_sub_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_extload_v32bf16_to_v32f64:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x1f
-; GFX10-NEXT:    global_load_ushort v3, v[1:2], off
-; GFX10-NEXT:    global_load_ushort v4, v[1:2], off offset:2
-; GFX10-NEXT:    global_load_ushort v5, v[1:2], off offset:4
-; GFX10-NEXT:    global_load_ushort v6, v[1:2], off offset:6
-; GFX10-NEXT:    global_load_ushort v7, v[1:2], off offset:8
-; GFX10-NEXT:    global_load_ushort v8, v[1:2], off offset:10
-; GFX10-NEXT:    global_load_ushort v9, v[1:2], off offset:12
-; GFX10-NEXT:    global_load_ushort v10, v[1:2], off offset:14
-; GFX10-NEXT:    global_load_ushort v11, v[1:2], off offset:16
-; GFX10-NEXT:    global_load_ushort v12, v[1:2], off offset:18
-; GFX10-NEXT:    global_load_ushort v13, v[1:2], off offset:20
-; GFX10-NEXT:    global_load_ushort v14, v[1:2], off offset:22
-; GFX10-NEXT:    global_load_ushort v15, v[1:2], off offset:24
-; GFX10-NEXT:    global_load_ushort v16, v[1:2], off offset:26
-; GFX10-NEXT:    global_load_ushort v17, v[1:2], off offset:28
-; GFX10-NEXT:    global_load_ushort v18, v[1:2], off offset:30
-; GFX10-NEXT:    global_load_ushort v19, v[1:2], off offset:32
-; GFX10-NEXT:    global_load_ushort v20, v[1:2], off offset:34
-; GFX10-NEXT:    global_load_ushort v21, v[1:2], off offset:36
-; GFX10-NEXT:    global_load_ushort v22, v[1:2], off offset:38
-; GFX10-NEXT:    global_load_ushort v23, v[1:2], off offset:40
-; GFX10-NEXT:    global_load_ushort v24, v[1:2], off offset:42
-; GFX10-NEXT:    global_load_ushort v25, v[1:2], off offset:44
-; GFX10-NEXT:    global_load_ushort v26, v[1:2], off offset:46
-; GFX10-NEXT:    global_load_ushort v27, v[1:2], off offset:48
-; GFX10-NEXT:    global_load_ushort v28, v[1:2], off offset:62
-; GFX10-NEXT:    global_load_ushort v29, v[1:2], off offset:50
-; GFX10-NEXT:    global_load_ushort v30, v[1:2], off offset:52
-; GFX10-NEXT:    global_load_ushort v31, v[1:2], off offset:54
-; GFX10-NEXT:    global_load_ushort v32, v[1:2], off offset:60
-; GFX10-NEXT:    global_load_ushort v33, v[1:2], off offset:56
-; GFX10-NEXT:    global_load_ushort v34, v[1:2], off offset:58
-; GFX10-NEXT:    s_waitcnt vmcnt(31)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    s_waitcnt vmcnt(30)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v4
-; GFX10-NEXT:    s_waitcnt vmcnt(29)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v5
-; GFX10-NEXT:    s_waitcnt vmcnt(28)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v6
-; GFX10-NEXT:    s_waitcnt vmcnt(27)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v7
-; GFX10-NEXT:    s_waitcnt vmcnt(26)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v8
-; GFX10-NEXT:    s_waitcnt vmcnt(25)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v9
-; GFX10-NEXT:    s_waitcnt vmcnt(24)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v10
-; GFX10-NEXT:    s_waitcnt vmcnt(23)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v11
-; GFX10-NEXT:    s_waitcnt vmcnt(22)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v12
-; GFX10-NEXT:    s_waitcnt vmcnt(21)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v13
-; GFX10-NEXT:    s_waitcnt vmcnt(20)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v14
-; GFX10-NEXT:    s_waitcnt vmcnt(19)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v15
-; GFX10-NEXT:    s_waitcnt vmcnt(18)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v16
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[11:12], v37
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[15:16], v38
-; GFX10-NEXT:    s_waitcnt vmcnt(15)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v19
-; GFX10-NEXT:    s_waitcnt vmcnt(14)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v20
-; GFX10-NEXT:    s_waitcnt vmcnt(13)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v21
-; GFX10-NEXT:    s_waitcnt vmcnt(12)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
-; GFX10-NEXT:    s_waitcnt vmcnt(11)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v70, 16, v23
-; GFX10-NEXT:    s_waitcnt vmcnt(10)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v71, 16, v24
-; GFX10-NEXT:    s_waitcnt vmcnt(9)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v80, 16, v25
-; GFX10-NEXT:    s_waitcnt vmcnt(8)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v81, 16, v26
-; GFX10-NEXT:    s_waitcnt vmcnt(7)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v82, 16, v27
-; GFX10-NEXT:    s_waitcnt vmcnt(6)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v28
-; GFX10-NEXT:    s_waitcnt vmcnt(5)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v83, 16, v29
-; GFX10-NEXT:    s_waitcnt vmcnt(4)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v84, 16, v30
-; GFX10-NEXT:    s_waitcnt vmcnt(3)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v31
-; GFX10-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v32
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[1:2], v1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v34
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v33
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[29:30], v29
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[5:6], v5
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[37:38], v84
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[13:14], v13
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[21:22], v21
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[25:26], v50
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[27:28], v51
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[50:51], v82
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[31:32], v52
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[33:34], v53
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[52:53], v80
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[7:8], v35
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[9:10], v36
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[19:20], v48
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[23:24], v49
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[35:36], v54
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[48:49], v55
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[54:55], v70
-; GFX10-NEXT:    v_lshlrev_b32_e32 v69, 16, v18
-; GFX10-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:252
-; GFX10-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:248
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[1:2], v83
-; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v17
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[3:4], v3
-; GFX10-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:244
-; GFX10-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:240
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[5:6], v81
-; GFX10-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:236
-; GFX10-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:232
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[13:14], v71
-; GFX10-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:228
-; GFX10-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:224
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[21:22], v65
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[64:65], v64
-; GFX10-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:220
-; GFX10-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:216
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[29:30], v67
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[66:67], v66
-; GFX10-NEXT:    buffer_store_dword v38, v0, s[0:3], 0 offen offset:212
-; GFX10-NEXT:    buffer_store_dword v37, v0, s[0:3], 0 offen offset:208
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[37:38], v69
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[17:18], v39
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[68:69], v68
-; GFX10-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:204
-; GFX10-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:200
-; GFX10-NEXT:    buffer_store_dword v51, v0, s[0:3], 0 offen offset:196
-; GFX10-NEXT:    buffer_store_dword v50, v0, s[0:3], 0 offen offset:192
-; GFX10-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:188
-; GFX10-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:184
-; GFX10-NEXT:    buffer_store_dword v53, v0, s[0:3], 0 offen offset:180
-; GFX10-NEXT:    buffer_store_dword v52, v0, s[0:3], 0 offen offset:176
-; GFX10-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:172
-; GFX10-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:168
-; GFX10-NEXT:    buffer_store_dword v55, v0, s[0:3], 0 offen offset:164
-; GFX10-NEXT:    buffer_store_dword v54, v0, s[0:3], 0 offen offset:160
-; GFX10-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:156
-; GFX10-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:152
-; GFX10-NEXT:    buffer_store_dword v65, v0, s[0:3], 0 offen offset:148
-; GFX10-NEXT:    buffer_store_dword v64, v0, s[0:3], 0 offen offset:144
-; GFX10-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:140
-; GFX10-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:136
-; GFX10-NEXT:    buffer_store_dword v67, v0, s[0:3], 0 offen offset:132
-; GFX10-NEXT:    buffer_store_dword v66, v0, s[0:3], 0 offen offset:128
-; GFX10-NEXT:    buffer_store_dword v38, v0, s[0:3], 0 offen offset:124
-; GFX10-NEXT:    buffer_store_dword v37, v0, s[0:3], 0 offen offset:120
-; GFX10-NEXT:    buffer_store_dword v69, v0, s[0:3], 0 offen offset:116
-; GFX10-NEXT:    buffer_store_dword v68, v0, s[0:3], 0 offen offset:112
-; GFX10-NEXT:    buffer_store_dword v49, v0, s[0:3], 0 offen offset:108
-; GFX10-NEXT:    buffer_store_dword v48, v0, s[0:3], 0 offen offset:104
-; GFX10-NEXT:    buffer_store_dword v36, v0, s[0:3], 0 offen offset:100
-; GFX10-NEXT:    buffer_store_dword v35, v0, s[0:3], 0 offen offset:96
-; GFX10-NEXT:    buffer_store_dword v34, v0, s[0:3], 0 offen offset:92
-; GFX10-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen offset:88
-; GFX10-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:84
-; GFX10-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:80
-; GFX10-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:76
-; GFX10-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:72
-; GFX10-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:68
-; GFX10-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:64
-; GFX10-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:60
-; GFX10-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:56
-; GFX10-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:52
-; GFX10-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:48
-; GFX10-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:44
-; GFX10-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:40
-; GFX10-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:36
-; GFX10-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:32
-; GFX10-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:28
-; GFX10-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:24
-; GFX10-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:20
-; GFX10-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:16
-; GFX10-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:12
-; GFX10-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:8
-; GFX10-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
-; GFX10-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; GFX10-LABEL: v_fsub_v3bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_sub_f32_e32 v4, v5, v4
+; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fsub_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_sub_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_sub_f32_e32 v4, v5, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fsub <3 x bfloat> %a, %b
+  ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; GCN-LABEL: v_fsub_v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_sub_f32_e32 v3, v3, v7
+; GCN-NEXT:    v_sub_f32_e32 v2, v2, v6
+; GCN-NEXT:    v_sub_f32_e32 v1, v1, v5
+; GCN-NEXT:    v_sub_f32_e32 v0, v0, v4
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fsub_v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v7
+; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_sub_f32_e32 v1, v1, v5
+; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v4
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fsub_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_sub_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_sub_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    v_perm_b32 v1, v1, v4, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fsub_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_sub_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_sub_f32_e32 v3, v5, v3
+; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fsub_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_sub_f32_e32 v3, v5, v4
+; GFX10-NEXT:    v_sub_f32_e32 v4, v7, v6
+; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
+; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_extload_v32bf16_to_v32f64:
+; GFX11-LABEL: v_fsub_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    global_load_u16 v3, v[1:2], off offset:12
-; GFX11-NEXT:    global_load_u16 v4, v[1:2], off offset:8
-; GFX11-NEXT:    global_load_u16 v5, v[1:2], off offset:4
-; GFX11-NEXT:    global_load_u16 v6, v[1:2], off offset:2
-; GFX11-NEXT:    global_load_u16 v7, v[1:2], off
-; GFX11-NEXT:    global_load_u16 v8, v[1:2], off offset:6
-; GFX11-NEXT:    global_load_u16 v9, v[1:2], off offset:10
-; GFX11-NEXT:    global_load_u16 v10, v[1:2], off offset:14
-; GFX11-NEXT:    global_load_u16 v11, v[1:2], off offset:28
-; GFX11-NEXT:    global_load_u16 v12, v[1:2], off offset:24
-; GFX11-NEXT:    global_load_u16 v13, v[1:2], off offset:20
-; GFX11-NEXT:    global_load_u16 v14, v[1:2], off offset:18
-; GFX11-NEXT:    global_load_u16 v15, v[1:2], off offset:16
-; GFX11-NEXT:    global_load_u16 v16, v[1:2], off offset:22
-; GFX11-NEXT:    global_load_u16 v17, v[1:2], off offset:26
-; GFX11-NEXT:    global_load_u16 v18, v[1:2], off offset:30
-; GFX11-NEXT:    global_load_u16 v19, v[1:2], off offset:44
-; GFX11-NEXT:    global_load_u16 v20, v[1:2], off offset:40
-; GFX11-NEXT:    global_load_u16 v21, v[1:2], off offset:36
-; GFX11-NEXT:    global_load_u16 v22, v[1:2], off offset:34
-; GFX11-NEXT:    global_load_u16 v23, v[1:2], off offset:32
-; GFX11-NEXT:    global_load_u16 v24, v[1:2], off offset:38
-; GFX11-NEXT:    global_load_u16 v25, v[1:2], off offset:42
-; GFX11-NEXT:    global_load_u16 v26, v[1:2], off offset:46
-; GFX11-NEXT:    global_load_u16 v27, v[1:2], off offset:60
-; GFX11-NEXT:    global_load_u16 v28, v[1:2], off offset:56
-; GFX11-NEXT:    global_load_u16 v29, v[1:2], off offset:52
-; GFX11-NEXT:    global_load_u16 v30, v[1:2], off offset:50
-; GFX11-NEXT:    global_load_u16 v31, v[1:2], off offset:48
-; GFX11-NEXT:    global_load_u16 v32, v[1:2], off offset:54
-; GFX11-NEXT:    global_load_u16 v33, v[1:2], off offset:58
-; GFX11-NEXT:    global_load_u16 v1, v[1:2], off offset:62
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_sub_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_sub_f32 v3, v7, v6 :: v_dual_sub_f32 v4, v5, v4
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x3020706
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    s_add_i32 s1, s0, 0xf0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0xe0
-; GFX11-NEXT:    s_add_i32 s3, s0, 0xd0
-; GFX11-NEXT:    s_add_i32 s4, s0, 0xc0
-; GFX11-NEXT:    s_add_i32 s5, s0, 0xb0
-; GFX11-NEXT:    s_add_i32 s6, s0, 0xa0
-; GFX11-NEXT:    s_add_i32 s7, s0, 0x90
-; GFX11-NEXT:    s_add_i32 s8, s0, 0x70
-; GFX11-NEXT:    s_add_i32 s9, s0, 0x60
-; GFX11-NEXT:    s_add_i32 s10, s0, 0x50
-; GFX11-NEXT:    s_add_i32 s11, s0, 48
-; GFX11-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v3
-; GFX11-NEXT:    s_waitcnt vmcnt(30)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v4
-; GFX11-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    s_waitcnt vmcnt(28)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
-; GFX11-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v7
-; GFX11-NEXT:    s_waitcnt vmcnt(26)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
-; GFX11-NEXT:    s_waitcnt vmcnt(25)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    s_waitcnt vmcnt(24)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    s_waitcnt vmcnt(23)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v102, 16, v11
-; GFX11-NEXT:    s_waitcnt vmcnt(22)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v101, 16, v12
-; GFX11-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    s_waitcnt vmcnt(20)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    s_waitcnt vmcnt(19)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v100, 16, v15
-; GFX11-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v16
-; GFX11-NEXT:    s_waitcnt vmcnt(17)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v19
-; GFX11-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v49, 16, v20
-; GFX11-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v103, 16, v23
-; GFX11-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v24
-; GFX11-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX11-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v27
-; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v28
-; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v31
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 16, v32
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v33
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[96:97], v68
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[84:85], v65
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[82:83], v64
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[86:87], v33
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[98:99], v1
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[80:81], v29
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[70:71], v30
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[68:69], v53
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[66:67], v26
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[64:65], v52
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[54:55], v25
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[52:53], v49
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[50:51], v48
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[48:49], v21
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[23:24], v34
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[35:36], v22
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[33:34], v103
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[31:32], v18
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[29:30], v102
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[27:28], v17
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[25:26], v101
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[21:22], v13
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[19:20], v14
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[17:18], v100
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[15:16], v10
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[13:14], v39
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[11:12], v9
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[9:10], v38
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[7:8], v6
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[5:6], v5
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[3:4], v2
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[1:2], v37
-; GFX11-NEXT:    scratch_store_b128 off, v[96:99], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[84:87], s2
-; GFX11-NEXT:    scratch_store_b128 off, v[80:83], s3
-; GFX11-NEXT:    scratch_store_b128 off, v[68:71], s4
-; GFX11-NEXT:    scratch_store_b128 off, v[64:67], s5
-; GFX11-NEXT:    scratch_store_b128 off, v[52:55], s6
-; GFX11-NEXT:    scratch_store_b128 off, v[48:51], s7
-; GFX11-NEXT:    scratch_store_b128 off, v[33:36], s0 offset:128
-; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s8
-; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s9
-; GFX11-NEXT:    scratch_store_b128 off, v[21:24], s10
-; GFX11-NEXT:    scratch_store_b128 off, v[17:20], s0 offset:64
-; GFX11-NEXT:    scratch_store_b128 off, v[13:16], s11
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    scratch_store_b128 off, v[9:12], s0 offset:32
-; GFX11-NEXT:    scratch_store_b128 off, v[5:8], s0 offset:16
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0
+; GFX11-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <32 x bfloat>, ptr addrspace(1) %ptr
-  %fpext = fpext <32 x bfloat> %load to <32 x double>
-  ret <32 x double> %fpext
+  %op = fsub <4 x bfloat> %a, %b
+  ret <4 x bfloat> %op
 }
 
-define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fadd_bf16:
+define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fmul_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: v_fadd_bf16:
+; GFX7-LABEL: v_fmul_bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_fadd_bf16:
+; GFX8-LABEL: v_fmul_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fadd_bf16:
+; GFX9-LABEL: v_fmul_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_fadd_bf16:
+; GFX10-LABEL: v_fmul_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fadd_bf16:
+; GFX11-LABEL: v_fmul_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %op = fadd bfloat %a, %b
+  %op = fmul bfloat %a, %b
   ret bfloat %op
 }
 
-define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
-; GCN-LABEL: v_fadd_v2bf16:
+define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_add_f32_e32 v1, v1, v3
-; GCN-NEXT:    v_add_f32_e32 v0, v0, v2
+; GCN-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: v_fadd_v2bf16:
+; GFX7-LABEL: v_fmul_v2bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_fadd_v2bf16:
+; GFX8-LABEL: v_fmul_v2bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_add_f32_e32 v2, v3, v2
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
 ; GFX8-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fadd_v2bf16:
+; GFX9-LABEL: v_fmul_v2bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_add_f32_e32 v2, v3, v2
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_fadd_v2bf16:
+; GFX10-LABEL: v_fmul_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_add_f32_e32 v2, v3, v2
-; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fadd_v2bf16:
+; GFX11-LABEL: v_fmul_v2bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
@@ -7002,17 +11190,17 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %op = fadd <2 x bfloat> %a, %b
+  %op = fmul <2 x bfloat> %a, %b
   ret <2 x bfloat> %op
 }
 
-define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
-; GCN-LABEL: v_fadd_v3bf16:
+define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v3bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
@@ -7021,15 +11209,15 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_add_f32_e32 v2, v2, v5
-; GCN-NEXT:    v_add_f32_e32 v1, v1, v4
-; GCN-NEXT:    v_add_f32_e32 v0, v0, v3
+; GCN-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GCN-NEXT:    v_mul_f32_e32 v1, v1, v4
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v3
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: v_fadd_v3bf16:
+; GFX7-LABEL: v_fmul_v3bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
@@ -7038,32 +11226,32 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_add_f32_e32 v2, v2, v5
-; GFX7-NEXT:    v_add_f32_e32 v1, v1, v4
-; GFX7-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v4
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v3
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_fadd_v3bf16:
+; GFX8-LABEL: v_fmul_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fadd_v3bf16:
+; GFX9-LABEL: v_fmul_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
@@ -7072,15 +11260,15 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_add_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_fadd_v3bf16:
+; GFX10-LABEL: v_fmul_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
@@ -7089,14 +11277,14 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_add_f32_e32 v4, v5, v4
-; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fadd_v3bf16:
+; GFX11-LABEL: v_fmul_v3bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
@@ -7105,20 +11293,20 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX11-NEXT:    v_mul_f32_e32 v4, v5, v4
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %op = fadd <3 x bfloat> %a, %b
+  %op = fmul <3 x bfloat> %a, %b
   ret <3 x bfloat> %op
 }
 
-define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
-; GCN-LABEL: v_fadd_v4bf16:
+define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
@@ -7129,17 +11317,17 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_add_f32_e32 v3, v3, v7
-; GCN-NEXT:    v_add_f32_e32 v2, v2, v6
-; GCN-NEXT:    v_add_f32_e32 v1, v1, v5
-; GCN-NEXT:    v_add_f32_e32 v0, v0, v4
+; GCN-NEXT:    v_mul_f32_e32 v3, v3, v7
+; GCN-NEXT:    v_mul_f32_e32 v2, v2, v6
+; GCN-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: v_fadd_v4bf16:
+; GFX7-LABEL: v_fmul_v4bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
@@ -7150,58 +11338,58 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_add_f32_e32 v3, v3, v7
-; GFX7-NEXT:    v_add_f32_e32 v2, v2, v6
-; GFX7-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX7-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX7-NEXT:    v_mul_f32_e32 v3, v3, v7
+; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_fadd_v4bf16:
+; GFX8-LABEL: v_fmul_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_add_f32_e32 v4, v5, v4
-; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_add_f32_e32 v3, v5, v3
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
 ; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX8-NEXT:    v_perm_b32 v1, v1, v4, s4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fadd_v4bf16:
+; GFX9-LABEL: v_fmul_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_add_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_add_f32_e32 v3, v5, v3
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_mul_f32_e32 v3, v5, v3
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_fadd_v4bf16:
+; GFX10-LABEL: v_fmul_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
@@ -7212,16 +11400,16 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_add_f32_e32 v3, v5, v4
-; GFX10-NEXT:    v_add_f32_e32 v4, v7, v6
-; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_mul_f32_e32 v3, v5, v4
+; GFX10-NEXT:    v_mul_f32_e32 v4, v7, v6
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
 ; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fadd_v4bf16:
+; GFX11-LABEL: v_fmul_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
@@ -7231,11 +11419,11 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v3, v7, v6 :: v_dual_add_f32 v4, v5, v4
+; GFX11-NEXT:    v_dual_mul_f32 v3, v7, v6 :: v_dual_mul_f32 v4, v5, v4
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x3020706
@@ -7243,12 +11431,12 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %op = fadd <4 x bfloat> %a, %b
+  %op = fmul <4 x bfloat> %a, %b
   ret <4 x bfloat> %op
 }
 
-define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
-; GCN-LABEL: v_fadd_v8bf16:
+define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v8bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
@@ -7267,14 +11455,14 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_add_f32_e32 v7, v7, v15
-; GCN-NEXT:    v_add_f32_e32 v6, v6, v14
-; GCN-NEXT:    v_add_f32_e32 v5, v5, v13
-; GCN-NEXT:    v_add_f32_e32 v4, v4, v12
-; GCN-NEXT:    v_add_f32_e32 v3, v3, v11
-; GCN-NEXT:    v_add_f32_e32 v2, v2, v10
-; GCN-NEXT:    v_add_f32_e32 v1, v1, v9
-; GCN-NEXT:    v_add_f32_e32 v0, v0, v8
+; GCN-NEXT:    v_mul_f32_e32 v7, v7, v15
+; GCN-NEXT:    v_mul_f32_e32 v6, v6, v14
+; GCN-NEXT:    v_mul_f32_e32 v5, v5, v13
+; GCN-NEXT:    v_mul_f32_e32 v4, v4, v12
+; GCN-NEXT:    v_mul_f32_e32 v3, v3, v11
+; GCN-NEXT:    v_mul_f32_e32 v2, v2, v10
+; GCN-NEXT:    v_mul_f32_e32 v1, v1, v9
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v8
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
@@ -7285,7 +11473,7 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: v_fadd_v8bf16:
+; GFX7-LABEL: v_fmul_v8bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
@@ -7304,14 +11492,14 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_add_f32_e32 v7, v7, v15
-; GFX7-NEXT:    v_add_f32_e32 v6, v6, v14
-; GFX7-NEXT:    v_add_f32_e32 v5, v5, v13
-; GFX7-NEXT:    v_add_f32_e32 v4, v4, v12
-; GFX7-NEXT:    v_add_f32_e32 v3, v3, v11
-; GFX7-NEXT:    v_add_f32_e32 v2, v2, v10
-; GFX7-NEXT:    v_add_f32_e32 v1, v1, v9
-; GFX7-NEXT:    v_add_f32_e32 v0, v0, v8
+; GFX7-NEXT:    v_mul_f32_e32 v7, v7, v15
+; GFX7-NEXT:    v_mul_f32_e32 v6, v6, v14
+; GFX7-NEXT:    v_mul_f32_e32 v5, v5, v13
+; GFX7-NEXT:    v_mul_f32_e32 v4, v4, v12
+; GFX7-NEXT:    v_mul_f32_e32 v3, v3, v11
+; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v10
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v9
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v8
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
@@ -7322,33 +11510,33 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_fadd_v8bf16:
+; GFX8-LABEL: v_fmul_v8bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_add_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_add_f32_e32 v3, v3, v7
+; GFX8-NEXT:    v_mul_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_mul_f32_e32 v3, v3, v7
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v6
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_add_f32_e32 v7, v9, v7
-; GFX8-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX8-NEXT:    v_mul_f32_e32 v7, v9, v7
+; GFX8-NEXT:    v_mul_f32_e32 v2, v2, v6
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_add_f32_e32 v6, v9, v6
-; GFX8-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX8-NEXT:    v_mul_f32_e32 v6, v9, v6
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v5
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_add_f32_e32 v5, v9, v5
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX8-NEXT:    v_mul_f32_e32 v5, v9, v5
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
 ; GFX8-NEXT:    v_perm_b32 v0, v0, v5, s4
 ; GFX8-NEXT:    v_perm_b32 v1, v1, v6, s4
@@ -7356,18 +11544,18 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-NEXT:    v_perm_b32 v3, v3, v8, s4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fadd_v8bf16:
+; GFX9-LABEL: v_fmul_v8bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
 ; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
-; GFX9-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_mul_f32_e32 v8, v9, v8
 ; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
 ; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
-; GFX9-NEXT:    v_add_f32_e32 v9, v10, v9
+; GFX9-NEXT:    v_mul_f32_e32 v9, v10, v9
 ; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v6
 ; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
-; GFX9-NEXT:    v_add_f32_e32 v10, v11, v10
+; GFX9-NEXT:    v_mul_f32_e32 v10, v11, v10
 ; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v4
 ; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
@@ -7376,14 +11564,14 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v9
-; GFX9-NEXT:    v_add_f32_e32 v11, v12, v11
-; GFX9-NEXT:    v_add_f32_e32 v3, v3, v7
-; GFX9-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX9-NEXT:    v_mul_f32_e32 v11, v12, v11
+; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v7
+; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v5
 ; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v8
@@ -7392,7 +11580,7 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX9-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_fadd_v8bf16:
+; GFX10-LABEL: v_fmul_v8bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
@@ -7403,8 +11591,8 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v4
 ; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v0
-; GFX10-NEXT:    v_add_f32_e32 v8, v9, v8
-; GFX10-NEXT:    v_add_f32_e32 v9, v11, v10
+; GFX10-NEXT:    v_mul_f32_e32 v8, v9, v8
+; GFX10-NEXT:    v_mul_f32_e32 v9, v11, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
@@ -7413,13 +11601,13 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_add_f32_e32 v10, v13, v12
-; GFX10-NEXT:    v_add_f32_e32 v11, v15, v14
-; GFX10-NEXT:    v_add_f32_e32 v3, v3, v7
-; GFX10-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX10-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX10-NEXT:    v_mul_f32_e32 v10, v13, v12
+; GFX10-NEXT:    v_mul_f32_e32 v11, v15, v14
+; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v7
+; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v9
-; GFX10-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v6
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v8
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v11, 0x3020706
 ; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -7427,7 +11615,7 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX10-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fadd_v8bf16:
+; GFX11-LABEL: v_fmul_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
@@ -7439,22 +11627,22 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
 ; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v4 :: v_dual_lshlrev_b32 v1, 16, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_add_f32 v8, v9, v8 :: v_dual_and_b32 v9, 0xffff0000, v5
+; GFX11-NEXT:    v_dual_mul_f32 v8, v9, v8 :: v_dual_and_b32 v9, 0xffff0000, v5
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_add_f32_e32 v3, v3, v7
+; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v9, v10, v9
-; GFX11-NEXT:    v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v12, 0xffff0000, v2
+; GFX11-NEXT:    v_mul_f32_e32 v9, v10, v9
+; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v5 :: v_dual_and_b32 v12, 0xffff0000, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v6
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v9
-; GFX11-NEXT:    v_add_f32_e32 v2, v2, v5
-; GFX11-NEXT:    v_dual_add_f32 v10, v12, v11 :: v_dual_add_f32 v11, v14, v13
+; GFX11-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GFX11-NEXT:    v_dual_mul_f32 v10, v12, v11 :: v_dual_mul_f32 v11, v14, v13
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_or_b32_e32 v1, v1, v4
@@ -7463,47 +11651,47 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX11-NEXT:    v_perm_b32 v0, v0, v11, 0x3020706
 ; GFX11-NEXT:    v_or_b32_e32 v3, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %op = fadd <8 x bfloat> %a, %b
+  %op = fmul <8 x bfloat> %a, %b
   ret <8 x bfloat> %op
 }
 
-define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
-; GCN-LABEL: v_fadd_v16bf16:
+define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v16bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT:    v_add_f32_e32 v14, v14, v30
+; GCN-NEXT:    v_mul_f32_e32 v14, v14, v30
 ; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT:    v_add_f32_e32 v13, v13, v29
+; GCN-NEXT:    v_mul_f32_e32 v13, v13, v29
 ; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
 ; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT:    v_add_f32_e32 v12, v12, v28
+; GCN-NEXT:    v_mul_f32_e32 v12, v12, v28
 ; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
 ; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT:    v_add_f32_e32 v11, v11, v27
+; GCN-NEXT:    v_mul_f32_e32 v11, v11, v27
 ; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT:    v_add_f32_e32 v10, v10, v26
+; GCN-NEXT:    v_mul_f32_e32 v10, v10, v26
 ; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT:    v_add_f32_e32 v9, v9, v25
+; GCN-NEXT:    v_mul_f32_e32 v9, v9, v25
 ; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT:    v_add_f32_e32 v8, v8, v24
+; GCN-NEXT:    v_mul_f32_e32 v8, v8, v24
 ; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT:    v_add_f32_e32 v7, v7, v23
+; GCN-NEXT:    v_mul_f32_e32 v7, v7, v23
 ; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT:    v_add_f32_e32 v6, v6, v22
+; GCN-NEXT:    v_mul_f32_e32 v6, v6, v22
 ; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
 ; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT:    v_add_f32_e32 v5, v5, v21
+; GCN-NEXT:    v_mul_f32_e32 v5, v5, v21
 ; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
 ; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT:    v_add_f32_e32 v4, v4, v20
+; GCN-NEXT:    v_mul_f32_e32 v4, v4, v20
 ; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32
 ; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
@@ -7514,10 +11702,10 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_add_f32_e32 v3, v3, v19
-; GCN-NEXT:    v_add_f32_e32 v2, v2, v18
-; GCN-NEXT:    v_add_f32_e32 v1, v1, v17
-; GCN-NEXT:    v_add_f32_e32 v0, v0, v16
+; GCN-NEXT:    v_mul_f32_e32 v3, v3, v19
+; GCN-NEXT:    v_mul_f32_e32 v2, v2, v18
+; GCN-NEXT:    v_mul_f32_e32 v1, v1, v17
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v16
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
@@ -7534,17 +11722,17 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v20
-; GCN-NEXT:    v_add_f32_e32 v15, v15, v16
+; GCN-NEXT:    v_mul_f32_e32 v15, v15, v16
 ; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: v_fadd_v16bf16:
+; GFX7-LABEL: v_fmul_v16bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT:    v_add_f32_e32 v4, v4, v20
+; GFX7-NEXT:    v_mul_f32_e32 v4, v4, v20
 ; GFX7-NEXT:    buffer_load_dword v20, off, s[0:3], s32
 ; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
@@ -7575,20 +11763,20 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT:    v_add_f32_e32 v0, v0, v16
-; GFX7-NEXT:    v_add_f32_e32 v14, v14, v30
-; GFX7-NEXT:    v_add_f32_e32 v13, v13, v29
-; GFX7-NEXT:    v_add_f32_e32 v12, v12, v28
-; GFX7-NEXT:    v_add_f32_e32 v11, v11, v27
-; GFX7-NEXT:    v_add_f32_e32 v10, v10, v26
-; GFX7-NEXT:    v_add_f32_e32 v9, v9, v25
-; GFX7-NEXT:    v_add_f32_e32 v8, v8, v24
-; GFX7-NEXT:    v_add_f32_e32 v7, v7, v23
-; GFX7-NEXT:    v_add_f32_e32 v6, v6, v22
-; GFX7-NEXT:    v_add_f32_e32 v5, v5, v21
-; GFX7-NEXT:    v_add_f32_e32 v3, v3, v19
-; GFX7-NEXT:    v_add_f32_e32 v2, v2, v18
-; GFX7-NEXT:    v_add_f32_e32 v1, v1, v17
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v16
+; GFX7-NEXT:    v_mul_f32_e32 v14, v14, v30
+; GFX7-NEXT:    v_mul_f32_e32 v13, v13, v29
+; GFX7-NEXT:    v_mul_f32_e32 v12, v12, v28
+; GFX7-NEXT:    v_mul_f32_e32 v11, v11, v27
+; GFX7-NEXT:    v_mul_f32_e32 v10, v10, v26
+; GFX7-NEXT:    v_mul_f32_e32 v9, v9, v25
+; GFX7-NEXT:    v_mul_f32_e32 v8, v8, v24
+; GFX7-NEXT:    v_mul_f32_e32 v7, v7, v23
+; GFX7-NEXT:    v_mul_f32_e32 v6, v6, v22
+; GFX7-NEXT:    v_mul_f32_e32 v5, v5, v21
+; GFX7-NEXT:    v_mul_f32_e32 v3, v3, v19
+; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v18
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v17
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
@@ -7606,61 +11794,61 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v20
-; GFX7-NEXT:    v_add_f32_e32 v15, v15, v16
+; GFX7-NEXT:    v_mul_f32_e32 v15, v15, v16
 ; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_fadd_v16bf16:
+; GFX8-LABEL: v_fmul_v16bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
 ; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_add_f32_e32 v16, v17, v16
-; GFX8-NEXT:    v_add_f32_e32 v7, v7, v15
+; GFX8-NEXT:    v_mul_f32_e32 v16, v17, v16
+; GFX8-NEXT:    v_mul_f32_e32 v7, v7, v15
 ; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v14
 ; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_add_f32_e32 v15, v17, v15
-; GFX8-NEXT:    v_add_f32_e32 v6, v6, v14
+; GFX8-NEXT:    v_mul_f32_e32 v15, v17, v15
+; GFX8-NEXT:    v_mul_f32_e32 v6, v6, v14
 ; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v13
 ; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_add_f32_e32 v14, v17, v14
-; GFX8-NEXT:    v_add_f32_e32 v5, v5, v13
+; GFX8-NEXT:    v_mul_f32_e32 v14, v17, v14
+; GFX8-NEXT:    v_mul_f32_e32 v5, v5, v13
 ; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v12
 ; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_add_f32_e32 v13, v17, v13
-; GFX8-NEXT:    v_add_f32_e32 v4, v4, v12
+; GFX8-NEXT:    v_mul_f32_e32 v13, v17, v13
+; GFX8-NEXT:    v_mul_f32_e32 v4, v4, v12
 ; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v11
 ; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_add_f32_e32 v12, v17, v12
-; GFX8-NEXT:    v_add_f32_e32 v3, v3, v11
+; GFX8-NEXT:    v_mul_f32_e32 v12, v17, v12
+; GFX8-NEXT:    v_mul_f32_e32 v3, v3, v11
 ; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v10
 ; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_add_f32_e32 v11, v17, v11
-; GFX8-NEXT:    v_add_f32_e32 v2, v2, v10
+; GFX8-NEXT:    v_mul_f32_e32 v11, v17, v11
+; GFX8-NEXT:    v_mul_f32_e32 v2, v2, v10
 ; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v9
 ; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_add_f32_e32 v10, v17, v10
-; GFX8-NEXT:    v_add_f32_e32 v1, v1, v9
+; GFX8-NEXT:    v_mul_f32_e32 v10, v17, v10
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v9
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v8
 ; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_add_f32_e32 v9, v17, v9
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v8
+; GFX8-NEXT:    v_mul_f32_e32 v9, v17, v9
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v8
 ; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
 ; GFX8-NEXT:    v_perm_b32 v0, v0, v9, s4
 ; GFX8-NEXT:    v_perm_b32 v1, v1, v10, s4
@@ -7672,30 +11860,30 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-NEXT:    v_perm_b32 v7, v7, v16, s4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fadd_v16bf16:
+; GFX9-LABEL: v_fmul_v16bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
 ; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
-; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
 ; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v14
 ; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
-; GFX9-NEXT:    v_add_f32_e32 v17, v18, v17
+; GFX9-NEXT:    v_mul_f32_e32 v17, v18, v17
 ; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v13
 ; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v5
-; GFX9-NEXT:    v_add_f32_e32 v18, v19, v18
+; GFX9-NEXT:    v_mul_f32_e32 v18, v19, v18
 ; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v12
 ; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v4
-; GFX9-NEXT:    v_add_f32_e32 v19, v20, v19
+; GFX9-NEXT:    v_mul_f32_e32 v19, v20, v19
 ; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v11
 ; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v3
-; GFX9-NEXT:    v_add_f32_e32 v20, v21, v20
+; GFX9-NEXT:    v_mul_f32_e32 v20, v21, v20
 ; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v10
 ; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v2
-; GFX9-NEXT:    v_add_f32_e32 v21, v22, v21
+; GFX9-NEXT:    v_mul_f32_e32 v21, v22, v21
 ; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v9
 ; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v1
-; GFX9-NEXT:    v_add_f32_e32 v22, v23, v22
+; GFX9-NEXT:    v_mul_f32_e32 v22, v23, v22
 ; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v8
 ; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
@@ -7713,16 +11901,16 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_add_f32_e32 v23, v24, v23
-; GFX9-NEXT:    v_add_f32_e32 v7, v7, v15
-; GFX9-NEXT:    v_add_f32_e32 v6, v6, v14
-; GFX9-NEXT:    v_add_f32_e32 v5, v5, v13
-; GFX9-NEXT:    v_add_f32_e32 v4, v4, v12
-; GFX9-NEXT:    v_add_f32_e32 v3, v3, v11
-; GFX9-NEXT:    v_add_f32_e32 v2, v2, v10
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v9
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v23, v24, v23
+; GFX9-NEXT:    v_mul_f32_e32 v7, v7, v15
+; GFX9-NEXT:    v_mul_f32_e32 v6, v6, v14
+; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v13
+; GFX9-NEXT:    v_mul_f32_e32 v4, v4, v12
+; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v11
+; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v10
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v9
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v8
 ; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v23, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v22, s4
@@ -7734,7 +11922,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX9-NEXT:    v_perm_b32 v7, v7, v16, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_fadd_v16bf16:
+; GFX10-LABEL: v_fmul_v16bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
@@ -7743,18 +11931,18 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v13
 ; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
 ; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
-; GFX10-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX10-NEXT:    v_mul_f32_e32 v16, v17, v16
 ; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v14
 ; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
 ; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v11
 ; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
 ; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v10
 ; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v2
-; GFX10-NEXT:    v_add_f32_e32 v17, v18, v17
-; GFX10-NEXT:    v_add_f32_e32 v18, v20, v19
-; GFX10-NEXT:    v_add_f32_e32 v19, v22, v21
-; GFX10-NEXT:    v_add_f32_e32 v20, v24, v23
-; GFX10-NEXT:    v_add_f32_e32 v21, v26, v25
+; GFX10-NEXT:    v_mul_f32_e32 v17, v18, v17
+; GFX10-NEXT:    v_mul_f32_e32 v18, v20, v19
+; GFX10-NEXT:    v_mul_f32_e32 v19, v22, v21
+; GFX10-NEXT:    v_mul_f32_e32 v20, v24, v23
+; GFX10-NEXT:    v_mul_f32_e32 v21, v26, v25
 ; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v9
 ; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v1
 ; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v8
@@ -7775,16 +11963,16 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_add_f32_e32 v22, v23, v22
-; GFX10-NEXT:    v_add_f32_e32 v23, v25, v24
-; GFX10-NEXT:    v_add_f32_e32 v7, v7, v15
-; GFX10-NEXT:    v_add_f32_e32 v6, v6, v14
-; GFX10-NEXT:    v_add_f32_e32 v5, v5, v13
-; GFX10-NEXT:    v_add_f32_e32 v0, v0, v8
-; GFX10-NEXT:    v_add_f32_e32 v1, v1, v9
-; GFX10-NEXT:    v_add_f32_e32 v2, v2, v10
-; GFX10-NEXT:    v_add_f32_e32 v3, v3, v11
-; GFX10-NEXT:    v_add_f32_e32 v4, v4, v12
+; GFX10-NEXT:    v_mul_f32_e32 v22, v23, v22
+; GFX10-NEXT:    v_mul_f32_e32 v23, v25, v24
+; GFX10-NEXT:    v_mul_f32_e32 v7, v7, v15
+; GFX10-NEXT:    v_mul_f32_e32 v6, v6, v14
+; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v13
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v8
+; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v9
+; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v10
+; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v11
+; GFX10-NEXT:    v_mul_f32_e32 v4, v4, v12
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v23, 0x3020706
 ; GFX10-NEXT:    v_perm_b32 v1, v1, v22, 0x3020706
 ; GFX10-NEXT:    v_perm_b32 v2, v2, v21, 0x3020706
@@ -7795,7 +11983,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX10-NEXT:    v_perm_b32 v7, v7, v16, 0x3020706
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fadd_v16bf16:
+; GFX11-LABEL: v_fmul_v16bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
@@ -7815,17 +12003,17 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_dual_add_f32 v16, v17, v16 :: v_dual_and_b32 v17, 0xffff0000, v14
+; GFX11-NEXT:    v_dual_mul_f32 v16, v17, v16 :: v_dual_and_b32 v17, 0xffff0000, v14
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_add_f32 v4, v4, v12 :: v_dual_add_f32 v5, v5, v13
-; GFX11-NEXT:    v_dual_add_f32 v17, v18, v17 :: v_dual_add_f32 v18, v20, v19
-; GFX11-NEXT:    v_add_f32_e32 v19, v22, v21
-; GFX11-NEXT:    v_add_f32_e32 v7, v7, v15
-; GFX11-NEXT:    v_add_f32_e32 v21, v26, v25
-; GFX11-NEXT:    v_dual_add_f32 v6, v6, v14 :: v_dual_and_b32 v25, 0xffff0000, v0
-; GFX11-NEXT:    v_add_f32_e32 v20, v24, v23
+; GFX11-NEXT:    v_dual_mul_f32 v4, v4, v12 :: v_dual_mul_f32 v5, v5, v13
+; GFX11-NEXT:    v_dual_mul_f32 v17, v18, v17 :: v_dual_mul_f32 v18, v20, v19
+; GFX11-NEXT:    v_mul_f32_e32 v19, v22, v21
+; GFX11-NEXT:    v_mul_f32_e32 v7, v7, v15
+; GFX11-NEXT:    v_mul_f32_e32 v21, v26, v25
+; GFX11-NEXT:    v_dual_mul_f32 v6, v6, v14 :: v_dual_and_b32 v25, 0xffff0000, v0
+; GFX11-NEXT:    v_mul_f32_e32 v20, v24, v23
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
@@ -7837,12 +12025,12 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v8
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_dual_add_f32 v2, v2, v10 :: v_dual_add_f32 v3, v3, v11
+; GFX11-NEXT:    v_dual_mul_f32 v2, v2, v10 :: v_dual_mul_f32 v3, v3, v11
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_add_f32 v1, v1, v9 :: v_dual_add_f32 v22, v23, v22
-; GFX11-NEXT:    v_add_f32_e32 v23, v25, v24
+; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v9 :: v_dual_mul_f32 v22, v23, v22
+; GFX11-NEXT:    v_mul_f32_e32 v23, v25, v24
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v8
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v8
 ; GFX11-NEXT:    v_perm_b32 v2, v2, v21, 0x3020706
 ; GFX11-NEXT:    v_perm_b32 v3, v3, v20, 0x3020706
 ; GFX11-NEXT:    v_perm_b32 v1, v1, v22, 0x3020706
@@ -7852,12 +12040,12 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX11-NEXT:    v_perm_b32 v6, v6, v17, 0x3020706
 ; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x3020706
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %op = fadd <16 x bfloat> %a, %b
+  %op = fmul <16 x bfloat> %a, %b
   ret <16 x bfloat> %op
 }
 
-define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
-; GCN-LABEL: v_fadd_v32bf16:
+define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v32bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128
@@ -7867,161 +12055,161 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:124
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_add_f32_e32 v31, v32, v31
+; GCN-NEXT:    v_mul_f32_e32 v31, v32, v31
 ; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
 ; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:120
-; GCN-NEXT:    v_add_f32_e32 v30, v30, v32
+; GCN-NEXT:    v_mul_f32_e32 v30, v30, v32
 ; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT:    v_add_f32_e32 v29, v29, v33
+; GCN-NEXT:    v_mul_f32_e32 v29, v29, v33
 ; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
 ; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:112
-; GCN-NEXT:    v_add_f32_e32 v28, v28, v32
+; GCN-NEXT:    v_mul_f32_e32 v28, v28, v32
 ; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
 ; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT:    v_add_f32_e32 v27, v27, v33
+; GCN-NEXT:    v_mul_f32_e32 v27, v27, v33
 ; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
 ; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:104
-; GCN-NEXT:    v_add_f32_e32 v26, v26, v32
+; GCN-NEXT:    v_mul_f32_e32 v26, v26, v32
 ; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT:    v_add_f32_e32 v25, v25, v33
+; GCN-NEXT:    v_mul_f32_e32 v25, v25, v33
 ; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
 ; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:96
-; GCN-NEXT:    v_add_f32_e32 v24, v24, v32
+; GCN-NEXT:    v_mul_f32_e32 v24, v24, v32
 ; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT:    v_add_f32_e32 v23, v23, v33
+; GCN-NEXT:    v_mul_f32_e32 v23, v23, v33
 ; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
 ; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:88
-; GCN-NEXT:    v_add_f32_e32 v22, v22, v32
+; GCN-NEXT:    v_mul_f32_e32 v22, v22, v32
 ; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
 ; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT:    v_add_f32_e32 v21, v21, v33
+; GCN-NEXT:    v_mul_f32_e32 v21, v21, v33
 ; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
 ; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:80
-; GCN-NEXT:    v_add_f32_e32 v20, v20, v32
+; GCN-NEXT:    v_mul_f32_e32 v20, v20, v32
 ; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT:    v_add_f32_e32 v19, v19, v33
+; GCN-NEXT:    v_mul_f32_e32 v19, v19, v33
 ; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
 ; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:72
-; GCN-NEXT:    v_add_f32_e32 v18, v18, v32
+; GCN-NEXT:    v_mul_f32_e32 v18, v18, v32
 ; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT:    v_add_f32_e32 v17, v17, v33
+; GCN-NEXT:    v_mul_f32_e32 v17, v17, v33
 ; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
 ; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:64
-; GCN-NEXT:    v_add_f32_e32 v16, v16, v32
+; GCN-NEXT:    v_mul_f32_e32 v16, v16, v32
 ; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT:    v_add_f32_e32 v15, v15, v33
+; GCN-NEXT:    v_mul_f32_e32 v15, v15, v33
 ; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
 ; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:56
-; GCN-NEXT:    v_add_f32_e32 v14, v14, v32
+; GCN-NEXT:    v_mul_f32_e32 v14, v14, v32
 ; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT:    v_add_f32_e32 v13, v13, v33
+; GCN-NEXT:    v_mul_f32_e32 v13, v13, v33
 ; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
 ; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:48
-; GCN-NEXT:    v_add_f32_e32 v12, v12, v32
+; GCN-NEXT:    v_mul_f32_e32 v12, v12, v32
 ; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT:    v_add_f32_e32 v11, v11, v33
+; GCN-NEXT:    v_mul_f32_e32 v11, v11, v33
 ; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
 ; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:40
-; GCN-NEXT:    v_add_f32_e32 v10, v10, v32
+; GCN-NEXT:    v_mul_f32_e32 v10, v10, v32
 ; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT:    v_add_f32_e32 v9, v9, v33
+; GCN-NEXT:    v_mul_f32_e32 v9, v9, v33
 ; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
 ; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:32
-; GCN-NEXT:    v_add_f32_e32 v8, v8, v32
+; GCN-NEXT:    v_mul_f32_e32 v8, v8, v32
 ; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT:    v_add_f32_e32 v7, v7, v33
+; GCN-NEXT:    v_mul_f32_e32 v7, v7, v33
 ; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
 ; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:24
-; GCN-NEXT:    v_add_f32_e32 v6, v6, v32
+; GCN-NEXT:    v_mul_f32_e32 v6, v6, v32
 ; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT:    v_add_f32_e32 v5, v5, v33
+; GCN-NEXT:    v_mul_f32_e32 v5, v5, v33
 ; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
 ; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:16
-; GCN-NEXT:    v_add_f32_e32 v4, v4, v32
+; GCN-NEXT:    v_mul_f32_e32 v4, v4, v32
 ; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT:    v_add_f32_e32 v3, v3, v33
+; GCN-NEXT:    v_mul_f32_e32 v3, v3, v33
 ; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
 ; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GCN-NEXT:    v_add_f32_e32 v2, v2, v32
+; GCN-NEXT:    v_mul_f32_e32 v2, v2, v32
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT:    v_add_f32_e32 v1, v1, v33
+; GCN-NEXT:    v_mul_f32_e32 v1, v1, v33
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_add_f32_e32 v0, v0, v32
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v32
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
@@ -8056,7 +12244,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: v_fadd_v32bf16:
+; GFX7-LABEL: v_fmul_v32bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128
@@ -8096,211 +12284,211 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v31, v32, v31
+; GFX7-NEXT:    v_mul_f32_e32 v31, v32, v31
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
 ; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v30, v30, v32
+; GFX7-NEXT:    v_mul_f32_e32 v30, v30, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:120
 ; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v29, v29, v32
+; GFX7-NEXT:    v_mul_f32_e32 v29, v29, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
 ; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v28, v28, v32
+; GFX7-NEXT:    v_mul_f32_e32 v28, v28, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:112
 ; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v27, v27, v32
+; GFX7-NEXT:    v_mul_f32_e32 v27, v27, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
 ; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v26, v26, v32
+; GFX7-NEXT:    v_mul_f32_e32 v26, v26, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:104
 ; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v25, v25, v32
+; GFX7-NEXT:    v_mul_f32_e32 v25, v25, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100
 ; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v24, v24, v32
+; GFX7-NEXT:    v_mul_f32_e32 v24, v24, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:96
 ; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v23, v23, v32
+; GFX7-NEXT:    v_mul_f32_e32 v23, v23, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
 ; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v22, v22, v32
+; GFX7-NEXT:    v_mul_f32_e32 v22, v22, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
 ; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v21, v21, v32
+; GFX7-NEXT:    v_mul_f32_e32 v21, v21, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
 ; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v20, v20, v32
+; GFX7-NEXT:    v_mul_f32_e32 v20, v20, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:80
 ; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v19, v19, v32
+; GFX7-NEXT:    v_mul_f32_e32 v19, v19, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
 ; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v18, v18, v32
+; GFX7-NEXT:    v_mul_f32_e32 v18, v18, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
 ; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v17, v17, v32
+; GFX7-NEXT:    v_mul_f32_e32 v17, v17, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
 ; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v16, v16, v32
+; GFX7-NEXT:    v_mul_f32_e32 v16, v16, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
 ; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v15, v15, v32
+; GFX7-NEXT:    v_mul_f32_e32 v15, v15, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
 ; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v14, v14, v32
+; GFX7-NEXT:    v_mul_f32_e32 v14, v14, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
 ; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v13, v13, v32
+; GFX7-NEXT:    v_mul_f32_e32 v13, v13, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
 ; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v12, v12, v32
+; GFX7-NEXT:    v_mul_f32_e32 v12, v12, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:48
 ; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v11, v11, v32
+; GFX7-NEXT:    v_mul_f32_e32 v11, v11, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
 ; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v10, v10, v32
+; GFX7-NEXT:    v_mul_f32_e32 v10, v10, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:40
 ; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v9, v9, v32
+; GFX7-NEXT:    v_mul_f32_e32 v9, v9, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
 ; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v8, v8, v32
+; GFX7-NEXT:    v_mul_f32_e32 v8, v8, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:32
 ; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v7, v7, v32
+; GFX7-NEXT:    v_mul_f32_e32 v7, v7, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
 ; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v6, v6, v32
+; GFX7-NEXT:    v_mul_f32_e32 v6, v6, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
 ; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v5, v5, v32
+; GFX7-NEXT:    v_mul_f32_e32 v5, v5, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v4, v4, v32
+; GFX7-NEXT:    v_mul_f32_e32 v4, v4, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:16
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v3, v3, v32
+; GFX7-NEXT:    v_mul_f32_e32 v3, v3, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
 ; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v2, v2, v32
+; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v1, v1, v32
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_add_f32_e32 v0, v0, v32
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_fadd_v32bf16:
+; GFX8-LABEL: v_fmul_v32bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v31, 0xffff0000, v30
 ; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v14
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX8-NEXT:    v_add_f32_e32 v31, v32, v31
-; GFX8-NEXT:    v_add_f32_e32 v14, v14, v30
+; GFX8-NEXT:    v_mul_f32_e32 v31, v32, v31
+; GFX8-NEXT:    v_mul_f32_e32 v14, v14, v30
 ; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v29
 ; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v13
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX8-NEXT:    v_add_f32_e32 v30, v32, v30
-; GFX8-NEXT:    v_add_f32_e32 v13, v13, v29
+; GFX8-NEXT:    v_mul_f32_e32 v30, v32, v30
+; GFX8-NEXT:    v_mul_f32_e32 v13, v13, v29
 ; GFX8-NEXT:    v_and_b32_e32 v29, 0xffff0000, v28
 ; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v12
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX8-NEXT:    v_add_f32_e32 v29, v32, v29
-; GFX8-NEXT:    v_add_f32_e32 v12, v12, v28
+; GFX8-NEXT:    v_mul_f32_e32 v29, v32, v29
+; GFX8-NEXT:    v_mul_f32_e32 v12, v12, v28
 ; GFX8-NEXT:    v_and_b32_e32 v28, 0xffff0000, v27
 ; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v11
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX8-NEXT:    v_add_f32_e32 v28, v32, v28
-; GFX8-NEXT:    v_add_f32_e32 v11, v11, v27
+; GFX8-NEXT:    v_mul_f32_e32 v28, v32, v28
+; GFX8-NEXT:    v_mul_f32_e32 v11, v11, v27
 ; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v26
 ; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v10
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX8-NEXT:    v_add_f32_e32 v27, v32, v27
-; GFX8-NEXT:    v_add_f32_e32 v10, v10, v26
+; GFX8-NEXT:    v_mul_f32_e32 v27, v32, v27
+; GFX8-NEXT:    v_mul_f32_e32 v10, v10, v26
 ; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v25
 ; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v9
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX8-NEXT:    v_add_f32_e32 v26, v32, v26
-; GFX8-NEXT:    v_add_f32_e32 v9, v9, v25
+; GFX8-NEXT:    v_mul_f32_e32 v26, v32, v26
+; GFX8-NEXT:    v_mul_f32_e32 v9, v9, v25
 ; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v24
 ; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v8
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    v_add_f32_e32 v8, v8, v24
+; GFX8-NEXT:    v_mul_f32_e32 v8, v8, v24
 ; GFX8-NEXT:    buffer_load_dword v24, off, s[0:3], s32
-; GFX8-NEXT:    v_add_f32_e32 v25, v32, v25
+; GFX8-NEXT:    v_mul_f32_e32 v25, v32, v25
 ; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
 ; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
@@ -8314,56 +12502,56 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX8-NEXT:    v_add_f32_e32 v32, v32, v33
-; GFX8-NEXT:    v_add_f32_e32 v15, v15, v24
+; GFX8-NEXT:    v_mul_f32_e32 v32, v32, v33
+; GFX8-NEXT:    v_mul_f32_e32 v15, v15, v24
 ; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v23
 ; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v7
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_add_f32_e32 v24, v33, v24
-; GFX8-NEXT:    v_add_f32_e32 v7, v7, v23
+; GFX8-NEXT:    v_mul_f32_e32 v24, v33, v24
+; GFX8-NEXT:    v_mul_f32_e32 v7, v7, v23
 ; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v22
 ; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_add_f32_e32 v23, v33, v23
-; GFX8-NEXT:    v_add_f32_e32 v6, v6, v22
+; GFX8-NEXT:    v_mul_f32_e32 v23, v33, v23
+; GFX8-NEXT:    v_mul_f32_e32 v6, v6, v22
 ; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v21
 ; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_add_f32_e32 v22, v33, v22
-; GFX8-NEXT:    v_add_f32_e32 v5, v5, v21
+; GFX8-NEXT:    v_mul_f32_e32 v22, v33, v22
+; GFX8-NEXT:    v_mul_f32_e32 v5, v5, v21
 ; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v20
 ; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_add_f32_e32 v21, v33, v21
-; GFX8-NEXT:    v_add_f32_e32 v4, v4, v20
+; GFX8-NEXT:    v_mul_f32_e32 v21, v33, v21
+; GFX8-NEXT:    v_mul_f32_e32 v4, v4, v20
 ; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v19
 ; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_add_f32_e32 v20, v33, v20
-; GFX8-NEXT:    v_add_f32_e32 v3, v3, v19
+; GFX8-NEXT:    v_mul_f32_e32 v20, v33, v20
+; GFX8-NEXT:    v_mul_f32_e32 v3, v3, v19
 ; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v18
 ; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_add_f32_e32 v19, v33, v19
-; GFX8-NEXT:    v_add_f32_e32 v2, v2, v18
+; GFX8-NEXT:    v_mul_f32_e32 v19, v33, v19
+; GFX8-NEXT:    v_mul_f32_e32 v2, v2, v18
 ; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v17
 ; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_add_f32_e32 v18, v33, v18
-; GFX8-NEXT:    v_add_f32_e32 v1, v1, v17
+; GFX8-NEXT:    v_mul_f32_e32 v18, v33, v18
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v17
 ; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v16
 ; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_add_f32_e32 v17, v33, v17
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v16
+; GFX8-NEXT:    v_mul_f32_e32 v17, v33, v17
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v16
 ; GFX8-NEXT:    v_perm_b32 v0, v0, v17, s4
 ; GFX8-NEXT:    v_perm_b32 v1, v1, v18, s4
 ; GFX8-NEXT:    v_perm_b32 v2, v2, v19, s4
@@ -8375,7 +12563,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-NEXT:    v_perm_b32 v15, v15, v32, s4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fadd_v32bf16:
+; GFX9-LABEL: v_fmul_v32bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
@@ -8414,19 +12602,19 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX9-NEXT:    v_and_b32_e32 v47, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v56, 0xffff0000, v18
 ; GFX9-NEXT:    v_and_b32_e32 v57, 0xffff0000, v2
-; GFX9-NEXT:    v_add_f32_e32 v38, v39, v38
-; GFX9-NEXT:    v_add_f32_e32 v39, v49, v48
-; GFX9-NEXT:    v_add_f32_e32 v48, v51, v50
-; GFX9-NEXT:    v_add_f32_e32 v51, v41, v40
-; GFX9-NEXT:    v_add_f32_e32 v40, v59, v58
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v17
+; GFX9-NEXT:    v_mul_f32_e32 v38, v39, v38
+; GFX9-NEXT:    v_mul_f32_e32 v39, v49, v48
+; GFX9-NEXT:    v_mul_f32_e32 v48, v51, v50
+; GFX9-NEXT:    v_mul_f32_e32 v51, v41, v40
+; GFX9-NEXT:    v_mul_f32_e32 v40, v59, v58
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v17
 ; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_add_f32_e32 v49, v53, v52
-; GFX9-NEXT:    v_add_f32_e32 v50, v55, v54
-; GFX9-NEXT:    v_add_f32_e32 v52, v43, v42
-; GFX9-NEXT:    v_add_f32_e32 v53, v45, v44
-; GFX9-NEXT:    v_add_f32_e32 v54, v47, v46
-; GFX9-NEXT:    v_add_f32_e32 v55, v57, v56
+; GFX9-NEXT:    v_mul_f32_e32 v49, v53, v52
+; GFX9-NEXT:    v_mul_f32_e32 v50, v55, v54
+; GFX9-NEXT:    v_mul_f32_e32 v52, v43, v42
+; GFX9-NEXT:    v_mul_f32_e32 v53, v45, v44
+; GFX9-NEXT:    v_mul_f32_e32 v54, v47, v46
+; GFX9-NEXT:    v_mul_f32_e32 v55, v57, v56
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v40, s4
 ; GFX9-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -8447,14 +12635,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX9-NEXT:    v_and_b32_e32 v35, 0xffff0000, v13
 ; GFX9-NEXT:    v_and_b32_e32 v36, 0xffff0000, v28
 ; GFX9-NEXT:    v_and_b32_e32 v37, 0xffff0000, v12
-; GFX9-NEXT:    v_add_f32_e32 v32, v33, v32
+; GFX9-NEXT:    v_mul_f32_e32 v32, v33, v32
 ; GFX9-NEXT:    v_and_b32_e32 v33, 0xffff0000, v16
-; GFX9-NEXT:    v_add_f32_e32 v34, v35, v34
+; GFX9-NEXT:    v_mul_f32_e32 v34, v35, v34
 ; GFX9-NEXT:    v_and_b32_e32 v35, 0xffff0000, v0
-; GFX9-NEXT:    v_add_f32_e32 v36, v37, v36
+; GFX9-NEXT:    v_mul_f32_e32 v36, v37, v36
 ; GFX9-NEXT:    v_and_b32_e32 v37, 0xffff0000, v15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX9-NEXT:    v_add_f32_e32 v33, v35, v33
+; GFX9-NEXT:    v_mul_f32_e32 v33, v35, v33
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
@@ -8483,20 +12671,20 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_add_f32_e32 v14, v14, v30
-; GFX9-NEXT:    v_add_f32_e32 v13, v13, v29
-; GFX9-NEXT:    v_add_f32_e32 v12, v12, v28
-; GFX9-NEXT:    v_add_f32_e32 v11, v11, v27
-; GFX9-NEXT:    v_add_f32_e32 v10, v10, v26
-; GFX9-NEXT:    v_add_f32_e32 v9, v9, v25
-; GFX9-NEXT:    v_add_f32_e32 v8, v8, v24
-; GFX9-NEXT:    v_add_f32_e32 v7, v7, v23
-; GFX9-NEXT:    v_add_f32_e32 v6, v6, v22
-; GFX9-NEXT:    v_add_f32_e32 v5, v5, v21
-; GFX9-NEXT:    v_add_f32_e32 v4, v4, v20
-; GFX9-NEXT:    v_add_f32_e32 v3, v3, v19
-; GFX9-NEXT:    v_add_f32_e32 v2, v2, v18
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v16
+; GFX9-NEXT:    v_mul_f32_e32 v14, v14, v30
+; GFX9-NEXT:    v_mul_f32_e32 v13, v13, v29
+; GFX9-NEXT:    v_mul_f32_e32 v12, v12, v28
+; GFX9-NEXT:    v_mul_f32_e32 v11, v11, v27
+; GFX9-NEXT:    v_mul_f32_e32 v10, v10, v26
+; GFX9-NEXT:    v_mul_f32_e32 v9, v9, v25
+; GFX9-NEXT:    v_mul_f32_e32 v8, v8, v24
+; GFX9-NEXT:    v_mul_f32_e32 v7, v7, v23
+; GFX9-NEXT:    v_mul_f32_e32 v6, v6, v22
+; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v21
+; GFX9-NEXT:    v_mul_f32_e32 v4, v4, v20
+; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v19
+; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v18
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v16
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v33, s4
 ; GFX9-NEXT:    v_perm_b32 v2, v2, v55, s4
 ; GFX9-NEXT:    v_perm_b32 v3, v3, v54, s4
@@ -8514,12 +12702,12 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v35, 0xffff0000, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
-; GFX9-NEXT:    v_add_f32_e32 v35, v37, v35
-; GFX9-NEXT:    v_add_f32_e32 v15, v15, v31
+; GFX9-NEXT:    v_mul_f32_e32 v35, v37, v35
+; GFX9-NEXT:    v_mul_f32_e32 v15, v15, v31
 ; GFX9-NEXT:    v_perm_b32 v15, v15, v35, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_fadd_v32bf16:
+; GFX10-LABEL: v_fmul_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
@@ -8543,30 +12731,30 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_and_b32_e32 v50, 0xffff0000, v10
 ; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v25
 ; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v9
-; GFX10-NEXT:    v_add_f32_e32 v53, v54, v53
+; GFX10-NEXT:    v_mul_f32_e32 v53, v54, v53
 ; GFX10-NEXT:    v_and_b32_e32 v54, 0xffff0000, v17
-; GFX10-NEXT:    v_add_f32_e32 v55, v64, v55
+; GFX10-NEXT:    v_mul_f32_e32 v55, v64, v55
 ; GFX10-NEXT:    v_and_b32_e32 v64, 0xffff0000, v1
-; GFX10-NEXT:    v_add_f32_e32 v65, v66, v65
+; GFX10-NEXT:    v_mul_f32_e32 v65, v66, v65
 ; GFX10-NEXT:    v_and_b32_e32 v66, 0xffff0000, v16
-; GFX10-NEXT:    v_add_f32_e32 v67, v68, v67
+; GFX10-NEXT:    v_mul_f32_e32 v67, v68, v67
 ; GFX10-NEXT:    v_and_b32_e32 v68, 0xffff0000, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX10-NEXT:    v_add_f32_e32 v33, v34, v33
+; GFX10-NEXT:    v_mul_f32_e32 v33, v34, v33
 ; GFX10-NEXT:    v_and_b32_e32 v34, 0xffff0000, v20
-; GFX10-NEXT:    v_add_f32_e32 v35, v36, v35
+; GFX10-NEXT:    v_mul_f32_e32 v35, v36, v35
 ; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v4
-; GFX10-NEXT:    v_add_f32_e32 v37, v38, v37
+; GFX10-NEXT:    v_mul_f32_e32 v37, v38, v37
 ; GFX10-NEXT:    v_and_b32_e32 v38, 0xffff0000, v19
-; GFX10-NEXT:    v_add_f32_e32 v39, v48, v39
+; GFX10-NEXT:    v_mul_f32_e32 v39, v48, v39
 ; GFX10-NEXT:    v_and_b32_e32 v48, 0xffff0000, v3
-; GFX10-NEXT:    v_add_f32_e32 v49, v50, v49
+; GFX10-NEXT:    v_mul_f32_e32 v49, v50, v49
 ; GFX10-NEXT:    v_and_b32_e32 v50, 0xffff0000, v18
-; GFX10-NEXT:    v_add_f32_e32 v51, v52, v51
+; GFX10-NEXT:    v_mul_f32_e32 v51, v52, v51
 ; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
@@ -8595,26 +12783,26 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_add_f32_e32 v0, v0, v16
-; GFX10-NEXT:    v_add_f32_e32 v1, v1, v17
-; GFX10-NEXT:    v_add_f32_e32 v34, v36, v34
-; GFX10-NEXT:    v_add_f32_e32 v36, v48, v38
-; GFX10-NEXT:    v_add_f32_e32 v38, v52, v50
-; GFX10-NEXT:    v_add_f32_e32 v48, v64, v54
-; GFX10-NEXT:    v_add_f32_e32 v50, v68, v66
-; GFX10-NEXT:    v_add_f32_e32 v14, v14, v30
-; GFX10-NEXT:    v_add_f32_e32 v13, v13, v29
-; GFX10-NEXT:    v_add_f32_e32 v12, v12, v28
-; GFX10-NEXT:    v_add_f32_e32 v11, v11, v27
-; GFX10-NEXT:    v_add_f32_e32 v10, v10, v26
-; GFX10-NEXT:    v_add_f32_e32 v9, v9, v25
-; GFX10-NEXT:    v_add_f32_e32 v8, v8, v24
-; GFX10-NEXT:    v_add_f32_e32 v7, v7, v23
-; GFX10-NEXT:    v_add_f32_e32 v6, v6, v22
-; GFX10-NEXT:    v_add_f32_e32 v5, v5, v21
-; GFX10-NEXT:    v_add_f32_e32 v2, v2, v18
-; GFX10-NEXT:    v_add_f32_e32 v3, v3, v19
-; GFX10-NEXT:    v_add_f32_e32 v4, v4, v20
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v16
+; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v17
+; GFX10-NEXT:    v_mul_f32_e32 v34, v36, v34
+; GFX10-NEXT:    v_mul_f32_e32 v36, v48, v38
+; GFX10-NEXT:    v_mul_f32_e32 v38, v52, v50
+; GFX10-NEXT:    v_mul_f32_e32 v48, v64, v54
+; GFX10-NEXT:    v_mul_f32_e32 v50, v68, v66
+; GFX10-NEXT:    v_mul_f32_e32 v14, v14, v30
+; GFX10-NEXT:    v_mul_f32_e32 v13, v13, v29
+; GFX10-NEXT:    v_mul_f32_e32 v12, v12, v28
+; GFX10-NEXT:    v_mul_f32_e32 v11, v11, v27
+; GFX10-NEXT:    v_mul_f32_e32 v10, v10, v26
+; GFX10-NEXT:    v_mul_f32_e32 v9, v9, v25
+; GFX10-NEXT:    v_mul_f32_e32 v8, v8, v24
+; GFX10-NEXT:    v_mul_f32_e32 v7, v7, v23
+; GFX10-NEXT:    v_mul_f32_e32 v6, v6, v22
+; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v21
+; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v18
+; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v19
+; GFX10-NEXT:    v_mul_f32_e32 v4, v4, v20
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v50, 0x3020706
 ; GFX10-NEXT:    v_perm_b32 v1, v1, v48, 0x3020706
 ; GFX10-NEXT:    v_perm_b32 v2, v2, v38, 0x3020706
@@ -8633,12 +12821,12 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v31
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v31
-; GFX10-NEXT:    v_add_f32_e32 v16, v32, v16
-; GFX10-NEXT:    v_add_f32_e32 v15, v15, v17
+; GFX10-NEXT:    v_mul_f32_e32 v16, v32, v16
+; GFX10-NEXT:    v_mul_f32_e32 v15, v15, v17
 ; GFX10-NEXT:    v_perm_b32 v15, v15, v16, 0x3020706
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fadd_v32bf16:
+; GFX11-LABEL: v_fmul_v32bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
@@ -8656,20 +12844,20 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
 ; GFX11-NEXT:    v_and_b32_e32 v55, 0xffff0000, v23
 ; GFX11-NEXT:    v_and_b32_e32 v70, 0xffff0000, v4
-; GFX11-NEXT:    v_add_f32_e32 v2, v2, v18
+; GFX11-NEXT:    v_mul_f32_e32 v2, v2, v18
 ; GFX11-NEXT:    v_and_b32_e32 v65, 0xffff0000, v22
-; GFX11-NEXT:    v_dual_add_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v22, 16, v22
+; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v22, 16, v22
 ; GFX11-NEXT:    v_and_b32_e32 v66, 0xffff0000, v6
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX11-NEXT:    v_and_b32_e32 v50, 0xffff0000, v10
 ; GFX11-NEXT:    v_and_b32_e32 v69, 0xffff0000, v20
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_add_f32 v6, v6, v22 :: v_dual_lshlrev_b32 v23, 16, v23
+; GFX11-NEXT:    v_dual_mul_f32 v6, v6, v22 :: v_dual_lshlrev_b32 v23, 16, v23
 ; GFX11-NEXT:    v_and_b32_e32 v54, 0xffff0000, v8
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX11-NEXT:    v_and_b32_e32 v71, 0xffff0000, v19
-; GFX11-NEXT:    v_dual_add_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v19, 16, v19
+; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v19, 16, v19
 ; GFX11-NEXT:    v_and_b32_e32 v64, 0xffff0000, v7
 ; GFX11-NEXT:    v_and_b32_e32 v49, 0xffff0000, v26
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
@@ -8678,19 +12866,19 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; GFX11-NEXT:    v_and_b32_e32 v80, 0xffff0000, v3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_dual_add_f32 v10, v10, v26 :: v_dual_and_b32 v67, 0xffff0000, v21
+; GFX11-NEXT:    v_dual_mul_f32 v10, v10, v26 :: v_dual_and_b32 v67, 0xffff0000, v21
 ; GFX11-NEXT:    v_and_b32_e32 v68, 0xffff0000, v5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v3, v3, v19 :: v_dual_and_b32 v38, 0xffff0000, v12
+; GFX11-NEXT:    v_dual_mul_f32 v3, v3, v19 :: v_dual_and_b32 v38, 0xffff0000, v12
 ; GFX11-NEXT:    v_and_b32_e32 v51, 0xffff0000, v25
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_add_f32_e32 v7, v7, v23
+; GFX11-NEXT:    v_mul_f32_e32 v7, v7, v23
 ; GFX11-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
 ; GFX11-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; GFX11-NEXT:    v_and_b32_e32 v39, 0xffff0000, v27
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-NEXT:    v_add_f32_e32 v9, v9, v25
+; GFX11-NEXT:    v_mul_f32_e32 v9, v9, v25
 ; GFX11-NEXT:    v_and_b32_e32 v53, 0xffff0000, v24
 ; GFX11-NEXT:    v_and_b32_e32 v48, 0xffff0000, v11
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
@@ -8698,20 +12886,20 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX11-NEXT:    v_and_b32_e32 v36, 0xffff0000, v13
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v4, 16, v4
+; GFX11-NEXT:    v_dual_mul_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v4, 16, v4
 ; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff0000, v30
 ; GFX11-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX11-NEXT:    v_dual_add_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v15, 16, v15
+; GFX11-NEXT:    v_dual_mul_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v15, 16, v15
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_dual_add_f32 v33, v34, v33 :: v_dual_add_f32 v34, v36, v35
-; GFX11-NEXT:    v_dual_add_f32 v35, v38, v37 :: v_dual_add_f32 v12, v12, v28
+; GFX11-NEXT:    v_dual_mul_f32 v33, v34, v33 :: v_dual_mul_f32 v34, v36, v35
+; GFX11-NEXT:    v_dual_mul_f32 v35, v38, v37 :: v_dual_mul_f32 v12, v12, v28
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v8, v8, v24 :: v_dual_add_f32 v5, v5, v21
+; GFX11-NEXT:    v_dual_mul_f32 v8, v8, v24 :: v_dual_mul_f32 v5, v5, v21
 ; GFX11-NEXT:    v_perm_b32 v12, v12, v35, 0x3020706
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v31
@@ -8719,16 +12907,16 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v31
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v16, v32, v16 :: v_dual_add_f32 v13, v13, v29
-; GFX11-NEXT:    v_dual_add_f32 v15, v15, v17 :: v_dual_add_f32 v14, v14, v30
-; GFX11-NEXT:    v_add_f32_e32 v36, v48, v39
-; GFX11-NEXT:    v_dual_add_f32 v48, v64, v55 :: v_dual_add_f32 v37, v50, v49
-; GFX11-NEXT:    v_add_f32_e32 v50, v68, v67
-; GFX11-NEXT:    v_dual_add_f32 v38, v52, v51 :: v_dual_add_f32 v51, v70, v69
-; GFX11-NEXT:    v_dual_add_f32 v52, v80, v71 :: v_dual_add_f32 v39, v54, v53
-; GFX11-NEXT:    v_dual_add_f32 v53, v82, v81 :: v_dual_add_f32 v54, v84, v83
-; GFX11-NEXT:    v_add_f32_e32 v55, v86, v85
-; GFX11-NEXT:    v_add_f32_e32 v49, v66, v65
+; GFX11-NEXT:    v_dual_mul_f32 v16, v32, v16 :: v_dual_mul_f32 v13, v13, v29
+; GFX11-NEXT:    v_dual_mul_f32 v15, v15, v17 :: v_dual_mul_f32 v14, v14, v30
+; GFX11-NEXT:    v_mul_f32_e32 v36, v48, v39
+; GFX11-NEXT:    v_dual_mul_f32 v48, v64, v55 :: v_dual_mul_f32 v37, v50, v49
+; GFX11-NEXT:    v_mul_f32_e32 v50, v68, v67
+; GFX11-NEXT:    v_dual_mul_f32 v38, v52, v51 :: v_dual_mul_f32 v51, v70, v69
+; GFX11-NEXT:    v_dual_mul_f32 v52, v80, v71 :: v_dual_mul_f32 v39, v54, v53
+; GFX11-NEXT:    v_dual_mul_f32 v53, v82, v81 :: v_dual_mul_f32 v54, v84, v83
+; GFX11-NEXT:    v_mul_f32_e32 v55, v86, v85
+; GFX11-NEXT:    v_mul_f32_e32 v49, v66, v65
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_perm_b32 v3, v3, v52, 0x3020706
 ; GFX11-NEXT:    v_perm_b32 v2, v2, v53, 0x3020706
@@ -8746,530 +12934,634 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX11-NEXT:    v_perm_b32 v14, v14, v33, 0x3020706
 ; GFX11-NEXT:    v_perm_b32 v15, v15, v16, 0x3020706
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %op = fadd <32 x bfloat> %a, %b
-  ret <32 x bfloat> %op
+  %op = fmul <32 x bfloat> %a, %b
+  ret <32 x bfloat> %op
+}
+
+define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fdiv_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GCN-NEXT:    v_rcp_f32_e32 v3, v2
+; GCN-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
+; GCN-NEXT:    v_fma_f32 v3, v4, v3, v3
+; GCN-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GCN-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GCN-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; GCN-NEXT:    v_fma_f32 v5, v6, v3, v5
+; GCN-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; GCN-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GCN-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fdiv_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX7-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
+; GFX7-NEXT:    v_fma_f32 v3, v4, v3, v3
+; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX7-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; GFX7-NEXT:    v_fma_f32 v5, v6, v3, v5
+; GFX7-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; GFX7-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GFX7-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fdiv_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX8-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
+; GFX8-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX8-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
+; GFX8-NEXT:    v_fma_f32 v4, v5, v4, v4
+; GFX8-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GFX8-NEXT:    v_fma_f32 v6, -v2, v5, v3
+; GFX8-NEXT:    v_fma_f32 v5, v6, v4, v5
+; GFX8-NEXT:    v_fma_f32 v2, -v2, v5, v3
+; GFX8-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
+; GFX8-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fdiv_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX9-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
+; GFX9-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX9-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
+; GFX9-NEXT:    v_fma_f32 v4, v5, v4, v4
+; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GFX9-NEXT:    v_fma_f32 v6, -v2, v5, v3
+; GFX9-NEXT:    v_fma_f32 v5, v6, v4, v5
+; GFX9-NEXT:    v_fma_f32 v2, -v2, v5, v3
+; GFX9-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
+; GFX9-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fdiv_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
+; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX10-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v3, v4, v3
+; GFX10-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
+; GFX10-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX10-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; GFX10-NEXT:    v_fmac_f32_e32 v5, v6, v3
+; GFX10-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GFX10-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fdiv_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
+; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e32 v3, v4, v3
+; GFX11-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; GFX11-NEXT:    v_fmac_f32_e32 v5, v6, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; GFX11-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fdiv bfloat %a, %b
+  ret bfloat %op
 }
 
-define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fsub_bf16:
+declare bfloat @llvm.fabs.bf16(bfloat)
+
+define bfloat @v_fabs_bf16(bfloat %a) {
+; GCN-LABEL: v_fabs_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: v_fsub_bf16:
+; GFX7-LABEL: v_fabs_bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_fsub_bf16:
+; GFX8-LABEL: v_fabs_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fsub_bf16:
+; GFX9-LABEL: v_fabs_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_fsub_bf16:
+; GFX10-LABEL: v_fabs_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fsub_bf16:
+; GFX11-LABEL: v_fabs_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %op = fsub bfloat %a, %b
+  %op = call bfloat @llvm.fabs.bf16(bfloat %a)
   ret bfloat %op
 }
 
-define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
-; GCN-LABEL: v_fsub_v2bf16:
+define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) {
+; GCN-LABEL: s_fabs_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_bfe_u32 s0, s0, 0xf0010
+; GCN-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: s_fabs_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_bfe_u32 s0, s0, 0xf0010
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fabs_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0xf0010
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fabs_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_bfe_u32 s0, s0, 0xf0010
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fabs_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_bfe_u32 s0, s0, 0xf0010
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_fabs_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_bfe_u32 s0, s0, 0xf0010
+; GFX11-NEXT:    ; return to shader part epilog
+  %op = call bfloat @llvm.fabs.bf16(bfloat %a)
+  %cast = bitcast bfloat %op to i16
+  %zext = zext i16 %cast to i32
+  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+  ret i32 %readlane
+}
+
+define bfloat @v_fneg_bf16(bfloat %a) {
+; GCN-LABEL: v_fneg_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GCN-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: v_fsub_v2bf16:
+; GFX7-LABEL: v_fneg_bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_fsub_v2bf16:
+; GFX8-LABEL: v_fneg_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_sub_f32_e32 v2, v3, v2
-; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fsub_v2bf16:
+; GFX9-LABEL: v_fneg_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_sub_f32_e32 v2, v3, v2
-; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX9-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_fsub_v2bf16:
+; GFX10-LABEL: v_fneg_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fneg_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fneg bfloat %a
+  ret bfloat %op
+}
+
+declare i32 @llvm.amdgcn.readfirstlane(i32)
+
+; FIXME: readfirstlane hack for other bugs
+define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) {
+; GCN-LABEL: s_fneg_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_lshr_b32 s0, s0, 16
+; GCN-NEXT:    s_xor_b32 s0, s0, 0x8000
+; GCN-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: s_fneg_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX7-NEXT:    s_xor_b32 s0, s0, 0x8000
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fneg_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0xffff8000
+; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fneg_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff8000
+; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fneg_bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_sub_f32_e32 v2, v3, v2
-; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX10-NEXT:    v_xor_b32_e64 v0, 0xffff8000, s0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: v_fsub_v2bf16:
+; GFX11-LABEL: s_fneg_bf16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_sub_f32_e32 v2, v3, v2
+; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_xor_b32_e64 v0, 0xffff8000, s0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %op = fsub <2 x bfloat> %a, %b
-  ret <2 x bfloat> %op
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    ; return to shader part epilog
+  %op = fneg bfloat %a
+  %cast = bitcast bfloat %op to i16
+  %zext = zext i16 %cast to i32
+  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+  ret i32 %readlane
 }
 
-define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
-; GCN-LABEL: v_fsub_v3bf16:
+define bfloat @v_fneg_fabs_bf16(bfloat %a) {
+; GCN-LABEL: v_fneg_fabs_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_sub_f32_e32 v2, v2, v5
-; GCN-NEXT:    v_sub_f32_e32 v1, v1, v4
-; GCN-NEXT:    v_sub_f32_e32 v0, v0, v3
+; GCN-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: v_fsub_v3bf16:
+; GFX7-LABEL: v_fneg_fabs_bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v5
-; GFX7-NEXT:    v_sub_f32_e32 v1, v1, v4
-; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_fsub_v3bf16:
+; GFX8-LABEL: v_fneg_fabs_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_sub_f32_e32 v3, v4, v3
-; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fsub_v3bf16:
+; GFX9-LABEL: v_fneg_fabs_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_sub_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_fsub_v3bf16:
+; GFX10-LABEL: v_fneg_fabs_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_sub_f32_e32 v4, v5, v4
-; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fsub_v3bf16:
+; GFX11-LABEL: v_fneg_fabs_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_sub_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_sub_f32_e32 v4, v5, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %op = fsub <3 x bfloat> %a, %b
-  ret <3 x bfloat> %op
+  %fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
+  %op = fneg bfloat %fabs
+  ret bfloat %op
 }
 
-define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
-; GCN-LABEL: v_fsub_v4bf16:
+; FIXME: readfirstlane hack for other bugs
+define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
+; GCN-LABEL: s_fneg_fabs_bf16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_sub_f32_e32 v3, v3, v7
-; GCN-NEXT:    v_sub_f32_e32 v2, v2, v6
-; GCN-NEXT:    v_sub_f32_e32 v1, v1, v5
-; GCN-NEXT:    v_sub_f32_e32 v0, v0, v4
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GCN-NEXT:    s_lshr_b32 s0, s0, 16
+; GCN-NEXT:    s_bitset1_b32 s0, 15
+; GCN-NEXT:    ; return to shader part epilog
 ;
-; GFX7-LABEL: v_fsub_v4bf16:
+; GFX7-LABEL: s_fneg_fabs_bf16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v7
-; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v6
-; GFX7-NEXT:    v_sub_f32_e32 v1, v1, v5
-; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v4
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX7-NEXT:    s_bitset1_b32 s0, 15
+; GFX7-NEXT:    ; return to shader part epilog
 ;
-; GFX8-LABEL: v_fsub_v4bf16:
+; GFX8-LABEL: s_fneg_fabs_bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_sub_f32_e32 v4, v5, v4
-; GFX8-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_sub_f32_e32 v3, v5, v3
-; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX8-NEXT:    v_perm_b32 v1, v1, v4, s4
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0xffff8000
+; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
 ;
-; GFX9-LABEL: v_fsub_v4bf16:
+; GFX9-LABEL: s_fneg_fabs_bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_sub_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_sub_f32_e32 v3, v5, v3
-; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff8000
+; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: v_fsub_v4bf16:
+; GFX10-LABEL: s_fneg_fabs_bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_sub_f32_e32 v3, v5, v4
-; GFX10-NEXT:    v_sub_f32_e32 v4, v7, v6
-; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
-; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX10-NEXT:    v_or_b32_e64 v0, 0xffff8000, s0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: v_fsub_v4bf16:
+; GFX11-LABEL: s_fneg_fabs_bf16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_sub_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_sub_f32 v3, v7, v6 :: v_dual_sub_f32 v4, v5, v4
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x3020706
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e64 v0, 0xffff8000, s0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %op = fsub <4 x bfloat> %a, %b
-  ret <4 x bfloat> %op
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    ; return to shader part epilog
+  %fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
+  %op = fneg bfloat %fabs
+  %cast = bitcast bfloat %op to i16
+  %zext = zext i16 %cast to i32
+  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+  ret i32 %readlane
 }
 
-define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fmul_bf16:
+declare bfloat @llvm.minnum.bf16(bfloat, bfloat)
+declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
+declare <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat>, <3 x bfloat>)
+declare <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
+declare <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat>, <8 x bfloat>)
+declare <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat>, <16 x bfloat>)
+declare <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat>, <32 x bfloat>)
+
+define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_minnum_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: v_fmul_bf16:
+; GFX7-LABEL: v_minnum_bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_fmul_bf16:
+; GFX8-LABEL: v_minnum_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmul_bf16:
+; GFX9-LABEL: v_minnum_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_fmul_bf16:
+; GFX10-LABEL: v_minnum_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fmul_bf16:
+; GFX11-LABEL: v_minnum_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %op = fmul bfloat %a, %b
+  %op = call bfloat @llvm.minnum.bf16(bfloat %a, bfloat %b)
   ret bfloat %op
 }
 
-define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
-; GCN-LABEL: v_fmul_v2bf16:
+define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GCN-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_min_f32_e32 v1, v1, v3
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: v_fmul_v2bf16:
+; GFX7-LABEL: v_minnum_v2bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_fmul_v2bf16:
+; GFX8-LABEL: v_minnum_v2bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_mul_f32_e32 v2, v3, v2
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT:    v_min_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
 ; GFX8-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmul_v2bf16:
+; GFX9-LABEL: v_minnum_v2bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_mul_f32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT:    v_min_f32_e32 v2, v3, v2
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_fmul_v2bf16:
+; GFX10-LABEL: v_minnum_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_mul_f32_e32 v2, v3, v2
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT:    v_min_f32_e32 v2, v3, v2
+; GFX10-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fmul_v2bf16:
+; GFX11-LABEL: v_minnum_v2bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2
+; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_min_f32_e32 v2, v3, v2
+; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %op = fmul <2 x bfloat> %a, %b
+  %op = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
   ret <2 x bfloat> %op
 }
 
-define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
-; GCN-LABEL: v_fmul_v3bf16:
+define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v3bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
@@ -9278,15 +13570,21 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_mul_f32_e32 v2, v2, v5
-; GCN-NEXT:    v_mul_f32_e32 v1, v1, v4
-; GCN-NEXT:    v_mul_f32_e32 v0, v0, v3
+; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_min_f32_e32 v2, v2, v5
+; GCN-NEXT:    v_min_f32_e32 v1, v1, v4
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v3
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: v_fmul_v3bf16:
+; GFX7-LABEL: v_minnum_v3bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
@@ -9295,87 +13593,115 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v5
-; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v4
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v3
+; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v5
+; GFX7-NEXT:    v_min_f32_e32 v1, v1, v4
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_fmul_v3bf16:
+; GFX8-LABEL: v_minnum_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_mul_f32_e32 v3, v4, v3
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT:    v_min_f32_e32 v3, v4, v3
+; GFX8-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmul_v3bf16:
+; GFX9-LABEL: v_minnum_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT:    v_min_f32_e32 v3, v4, v3
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_fmul_v3bf16:
+; GFX10-LABEL: v_minnum_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_mul_f32_e32 v4, v5, v4
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT:    v_min_f32_e32 v4, v5, v4
+; GFX10-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fmul_v3bf16:
+; GFX11-LABEL: v_minnum_v3bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX11-NEXT:    v_dual_max_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_and_b32 v5, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v0, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_min_f32 v1, v1, v3 :: v_dual_min_f32 v0, v0, v2
+; GFX11-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_min_f32_e32 v4, v5, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %op = fmul <3 x bfloat> %a, %b
+  %op = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
   ret <3 x bfloat> %op
 }
 
-define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
-; GCN-LABEL: v_fmul_v4bf16:
+define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
@@ -9386,17 +13712,25 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_mul_f32_e32 v3, v3, v7
-; GCN-NEXT:    v_mul_f32_e32 v2, v2, v6
-; GCN-NEXT:    v_mul_f32_e32 v1, v1, v5
-; GCN-NEXT:    v_mul_f32_e32 v0, v0, v4
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_min_f32_e32 v3, v3, v7
+; GCN-NEXT:    v_min_f32_e32 v2, v2, v6
+; GCN-NEXT:    v_min_f32_e32 v1, v1, v5
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v4
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: v_fmul_v4bf16:
+; GFX7-LABEL: v_minnum_v4bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
@@ -9407,58 +13741,82 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v3, v3, v7
-; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v6
-; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v5
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_min_f32_e32 v1, v1, v5
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v4
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_fmul_v4bf16:
+; GFX8-LABEL: v_minnum_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_mul_f32_e32 v4, v5, v4
-; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT:    v_min_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_mul_f32_e32 v3, v5, v3
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT:    v_min_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
 ; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX8-NEXT:    v_perm_b32 v1, v1, v4, s4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmul_v4bf16:
+; GFX9-LABEL: v_minnum_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT:    v_min_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_mul_f32_e32 v3, v5, v3
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT:    v_min_f32_e32 v3, v5, v3
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_fmul_v4bf16:
+; GFX10-LABEL: v_minnum_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
@@ -9468,836 +13826,4775 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_mul_f32_e32 v3, v5, v4
-; GFX10-NEXT:    v_mul_f32_e32 v4, v7, v6
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT:    v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT:    v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_min_f32_e32 v3, v5, v4
+; GFX10-NEXT:    v_min_f32_e32 v4, v7, v6
+; GFX10-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
 ; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fmul_v4bf16:
+; GFX11-LABEL: v_minnum_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_max_f32 v5, v5, v5 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-NEXT:    v_dual_max_f32 v4, v4, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_min_f32_e32 v4, v5, v4
+; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_and_b32 v6, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_max_f32 v5, v6, v6 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_max_f32 v6, v7, v7 :: v_dual_max_f32 v1, v1, v1
+; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_min_f32 v1, v1, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_min_f32_e32 v4, v6, v5
+; GFX11-NEXT:    v_dual_min_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
+; GFX11-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
+  ret <4 x bfloat> %op
+}
+
+define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v8bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_min_f32_e32 v7, v7, v15
+; GCN-NEXT:    v_min_f32_e32 v6, v6, v14
+; GCN-NEXT:    v_min_f32_e32 v5, v5, v13
+; GCN-NEXT:    v_min_f32_e32 v4, v4, v12
+; GCN-NEXT:    v_min_f32_e32 v3, v3, v11
+; GCN-NEXT:    v_min_f32_e32 v2, v2, v10
+; GCN-NEXT:    v_min_f32_e32 v1, v1, v9
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v8
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_v8bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_min_f32_e32 v7, v7, v15
+; GFX7-NEXT:    v_min_f32_e32 v6, v6, v14
+; GFX7-NEXT:    v_min_f32_e32 v5, v5, v13
+; GFX7-NEXT:    v_min_f32_e32 v4, v4, v12
+; GFX7-NEXT:    v_min_f32_e32 v3, v3, v11
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v10
+; GFX7-NEXT:    v_min_f32_e32 v1, v1, v9
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v8
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_v8bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_mul_f32_e32 v8, 1.0, v8
+; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT:    v_min_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v6
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT:    v_min_f32_e32 v7, v9, v7
+; GFX8-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT:    v_min_f32_e32 v6, v9, v6
+; GFX8-NEXT:    v_min_f32_e32 v1, v1, v5
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT:    v_min_f32_e32 v5, v9, v5
+; GFX8-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v5, s4
+; GFX8-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX8-NEXT:    v_perm_b32 v2, v2, v7, s4
+; GFX8-NEXT:    v_perm_b32 v3, v3, v8, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_v8bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
+; GFX9-NEXT:    v_max_f32_e32 v8, v8, v8
+; GFX9-NEXT:    v_max_f32_e32 v9, v9, v9
+; GFX9-NEXT:    v_min_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
+; GFX9-NEXT:    v_max_f32_e32 v9, v9, v9
+; GFX9-NEXT:    v_max_f32_e32 v10, v10, v10
+; GFX9-NEXT:    v_min_f32_e32 v9, v10, v9
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v6
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
+; GFX9-NEXT:    v_max_f32_e32 v10, v10, v10
+; GFX9-NEXT:    v_max_f32_e32 v11, v11, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_min_f32_e32 v10, v11, v10
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v4
+; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v0
+; GFX9-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT:    v_max_f32_e32 v11, v11, v11
+; GFX9-NEXT:    v_max_f32_e32 v12, v12, v12
+; GFX9-NEXT:    v_max_f32_e32 v7, v7, v7
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v9
+; GFX9-NEXT:    v_min_f32_e32 v11, v12, v11
+; GFX9-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX9-NEXT:    v_min_f32_e32 v2, v2, v5
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v8
+; GFX9-NEXT:    v_perm_b32 v0, v0, v11, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v10, s4
+; GFX9-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_v8bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v2
+; GFX10-NEXT:    v_max_f32_e32 v8, v8, v8
+; GFX10-NEXT:    v_max_f32_e32 v9, v9, v9
+; GFX10-NEXT:    v_max_f32_e32 v10, v10, v10
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_min_f32_e32 v8, v9, v8
+; GFX10-NEXT:    v_max_f32_e32 v9, v11, v11
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_min_f32_e32 v9, v9, v10
+; GFX10-NEXT:    v_max_f32_e32 v10, v11, v11
+; GFX10-NEXT:    v_max_f32_e32 v11, v12, v12
+; GFX10-NEXT:    v_max_f32_e32 v12, v13, v13
+; GFX10-NEXT:    v_max_f32_e32 v13, v14, v14
+; GFX10-NEXT:    v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT:    v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT:    v_min_f32_e32 v10, v11, v10
+; GFX10-NEXT:    v_min_f32_e32 v11, v13, v12
+; GFX10-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX10-NEXT:    v_min_f32_e32 v1, v1, v5
+; GFX10-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v9
+; GFX10-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v8
+; GFX10-NEXT:    v_perm_b32 v0, v0, v11, 0x3020706
+; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_perm_b32 v2, v2, v10, 0x3020706
+; GFX10-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minnum_v8bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v8, v8, v8
+; GFX11-NEXT:    v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v10, v10, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_max_f32 v7, v7, v7 :: v_dual_min_f32 v8, v9, v8
+; GFX11-NEXT:    v_min_f32_e32 v9, v11, v10
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v6
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_max_f32 v11, v11, v11 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX11-NEXT:    v_max_f32_e32 v10, v12, v12
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX11-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_min_f32_e32 v10, v11, v10
+; GFX11-NEXT:    v_dual_max_f32 v13, v13, v13 :: v_dual_and_b32 v12, 0xffff0000, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v12, v12, v12
+; GFX11-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX11-NEXT:    v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v3, v3, v3
+; GFX11-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_min_f32_e32 v11, v13, v12
+; GFX11-NEXT:    v_dual_min_f32 v0, v0, v4 :: v_dual_min_f32 v3, v3, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_min_f32_e32 v1, v1, v5
+; GFX11-NEXT:    v_dual_max_f32 v5, v6, v6 :: v_dual_and_b32 v4, 0xffff0000, v9
+; GFX11-NEXT:    v_perm_b32 v0, v0, v11, 0x3020706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_mul_f32 v3, v7, v6 :: v_dual_mul_f32 v4, v5, v4
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_dual_min_f32 v2, v2, v5 :: v_dual_and_b32 v5, 0xffff0000, v8
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x3020706
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX11-NEXT:    v_perm_b32 v2, v2, v10, 0x3020706
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %op = fmul <4 x bfloat> %a, %b
-  ret <4 x bfloat> %op
+  %op = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
+  ret <8 x bfloat> %op
 }
 
-define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fdiv_bf16:
+define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v16bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT:    v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT:    v_min_f32_e32 v14, v14, v30
+; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT:    v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT:    v_min_f32_e32 v13, v13, v29
+; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT:    v_min_f32_e32 v12, v12, v28
+; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT:    v_min_f32_e32 v11, v11, v27
+; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT:    v_min_f32_e32 v10, v10, v26
+; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT:    v_min_f32_e32 v9, v9, v25
+; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v24
+; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT:    v_min_f32_e32 v8, v8, v24
+; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT:    v_min_f32_e32 v7, v7, v23
+; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT:    v_min_f32_e32 v6, v6, v22
+; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT:    v_min_f32_e32 v5, v5, v21
+; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT:    v_min_f32_e32 v4, v4, v20
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
-; GCN-NEXT:    v_rcp_f32_e32 v3, v2
-; GCN-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
-; GCN-NEXT:    v_fma_f32 v3, v4, v3, v3
-; GCN-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
-; GCN-NEXT:    v_mul_f32_e32 v5, v4, v3
-; GCN-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GCN-NEXT:    v_fma_f32 v5, v6, v3, v5
-; GCN-NEXT:    v_fma_f32 v2, -v2, v5, v4
-; GCN-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
-; GCN-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_min_f32_e32 v3, v3, v19
+; GCN-NEXT:    v_min_f32_e32 v2, v2, v18
+; GCN-NEXT:    v_min_f32_e32 v1, v1, v17
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v16
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v20
+; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT:    v_min_f32_e32 v15, v15, v16
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: v_fdiv_bf16:
+; GFX7-LABEL: v_minnum_v16bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT:    v_min_f32_e32 v6, v6, v22
+; GFX7-NEXT:    buffer_load_dword v22, off, s[0:3], s32
+; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
-; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
-; GFX7-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
-; GFX7-NEXT:    v_fma_f32 v3, v4, v3, v3
-; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
-; GFX7-NEXT:    v_mul_f32_e32 v5, v4, v3
-; GFX7-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX7-NEXT:    v_fma_f32 v5, v6, v3, v5
-; GFX7-NEXT:    v_fma_f32 v2, -v2, v5, v4
-; GFX7-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
-; GFX7-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT:    v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT:    v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT:    v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT:    v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_min_f32_e32 v14, v14, v30
+; GFX7-NEXT:    v_min_f32_e32 v13, v13, v29
+; GFX7-NEXT:    v_min_f32_e32 v12, v12, v28
+; GFX7-NEXT:    v_min_f32_e32 v11, v11, v27
+; GFX7-NEXT:    v_min_f32_e32 v10, v10, v26
+; GFX7-NEXT:    v_min_f32_e32 v9, v9, v25
+; GFX7-NEXT:    v_min_f32_e32 v8, v8, v24
+; GFX7-NEXT:    v_min_f32_e32 v7, v7, v23
+; GFX7-NEXT:    v_min_f32_e32 v5, v5, v21
+; GFX7-NEXT:    v_min_f32_e32 v4, v4, v20
+; GFX7-NEXT:    v_min_f32_e32 v3, v3, v19
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v18
+; GFX7-NEXT:    v_min_f32_e32 v1, v1, v17
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v16
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT:    v_min_f32_e32 v15, v15, v22
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_fdiv_bf16:
+; GFX8-LABEL: v_minnum_v16bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
-; GFX8-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
-; GFX8-NEXT:    v_rcp_f32_e32 v4, v2
-; GFX8-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
-; GFX8-NEXT:    v_fma_f32 v4, v5, v4, v4
-; GFX8-NEXT:    v_mul_f32_e32 v5, v3, v4
-; GFX8-NEXT:    v_fma_f32 v6, -v2, v5, v3
-; GFX8-NEXT:    v_fma_f32 v5, v6, v4, v5
-; GFX8-NEXT:    v_fma_f32 v2, -v2, v5, v3
-; GFX8-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
-; GFX8-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_mul_f32_e32 v16, 1.0, v16
+; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GFX8-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT:    v_min_f32_e32 v16, v17, v16
+; GFX8-NEXT:    v_min_f32_e32 v7, v7, v15
+; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v14
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT:    v_mul_f32_e32 v14, 1.0, v14
+; GFX8-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT:    v_min_f32_e32 v15, v17, v15
+; GFX8-NEXT:    v_min_f32_e32 v6, v6, v14
+; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v13
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_mul_f32_e32 v14, 1.0, v14
+; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT:    v_mul_f32_e32 v13, 1.0, v13
+; GFX8-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT:    v_min_f32_e32 v14, v17, v14
+; GFX8-NEXT:    v_min_f32_e32 v5, v5, v13
+; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v12
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_mul_f32_e32 v13, 1.0, v13
+; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT:    v_mul_f32_e32 v12, 1.0, v12
+; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT:    v_min_f32_e32 v13, v17, v13
+; GFX8-NEXT:    v_min_f32_e32 v4, v4, v12
+; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v11
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_mul_f32_e32 v12, 1.0, v12
+; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT:    v_mul_f32_e32 v11, 1.0, v11
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT:    v_min_f32_e32 v12, v17, v12
+; GFX8-NEXT:    v_min_f32_e32 v3, v3, v11
+; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v10
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_mul_f32_e32 v11, 1.0, v11
+; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT:    v_mul_f32_e32 v10, 1.0, v10
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT:    v_min_f32_e32 v11, v17, v11
+; GFX8-NEXT:    v_min_f32_e32 v2, v2, v10
+; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v9
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mul_f32_e32 v10, 1.0, v10
+; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT:    v_min_f32_e32 v10, v17, v10
+; GFX8-NEXT:    v_min_f32_e32 v1, v1, v9
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v8
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT:    v_mul_f32_e32 v8, 1.0, v8
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT:    v_min_f32_e32 v9, v17, v9
+; GFX8-NEXT:    v_min_f32_e32 v0, v0, v8
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v9, s4
+; GFX8-NEXT:    v_perm_b32 v1, v1, v10, s4
+; GFX8-NEXT:    v_perm_b32 v2, v2, v11, s4
+; GFX8-NEXT:    v_perm_b32 v3, v3, v12, s4
+; GFX8-NEXT:    v_perm_b32 v4, v4, v13, s4
+; GFX8-NEXT:    v_perm_b32 v5, v5, v14, s4
+; GFX8-NEXT:    v_perm_b32 v6, v6, v15, s4
+; GFX8-NEXT:    v_perm_b32 v7, v7, v16, s4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fdiv_bf16:
+; GFX9-LABEL: v_minnum_v16bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
-; GFX9-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
-; GFX9-NEXT:    v_rcp_f32_e32 v4, v2
-; GFX9-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
-; GFX9-NEXT:    v_fma_f32 v4, v5, v4, v4
-; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v4
-; GFX9-NEXT:    v_fma_f32 v6, -v2, v5, v3
-; GFX9-NEXT:    v_fma_f32 v5, v6, v4, v5
-; GFX9-NEXT:    v_fma_f32 v2, -v2, v5, v3
-; GFX9-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
-; GFX9-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
+; GFX9-NEXT:    v_max_f32_e32 v16, v16, v16
+; GFX9-NEXT:    v_max_f32_e32 v17, v17, v17
+; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v14
+; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
+; GFX9-NEXT:    v_max_f32_e32 v17, v17, v17
+; GFX9-NEXT:    v_max_f32_e32 v18, v18, v18
+; GFX9-NEXT:    v_min_f32_e32 v17, v18, v17
+; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v13
+; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v5
+; GFX9-NEXT:    v_max_f32_e32 v18, v18, v18
+; GFX9-NEXT:    v_max_f32_e32 v19, v19, v19
+; GFX9-NEXT:    v_min_f32_e32 v18, v19, v18
+; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v12
+; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v4
+; GFX9-NEXT:    v_max_f32_e32 v19, v19, v19
+; GFX9-NEXT:    v_max_f32_e32 v20, v20, v20
+; GFX9-NEXT:    v_min_f32_e32 v19, v20, v19
+; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v11
+; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v3
+; GFX9-NEXT:    v_max_f32_e32 v20, v20, v20
+; GFX9-NEXT:    v_max_f32_e32 v21, v21, v21
+; GFX9-NEXT:    v_min_f32_e32 v20, v21, v20
+; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v10
+; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v2
+; GFX9-NEXT:    v_max_f32_e32 v21, v21, v21
+; GFX9-NEXT:    v_max_f32_e32 v22, v22, v22
+; GFX9-NEXT:    v_min_f32_e32 v21, v22, v21
+; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v9
+; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v1
+; GFX9-NEXT:    v_max_f32_e32 v22, v22, v22
+; GFX9-NEXT:    v_max_f32_e32 v23, v23, v23
+; GFX9-NEXT:    v_min_f32_e32 v22, v23, v22
+; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v8
+; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_max_f32_e32 v23, v23, v23
+; GFX9-NEXT:    v_max_f32_e32 v24, v24, v24
+; GFX9-NEXT:    v_max_f32_e32 v15, v15, v15
+; GFX9-NEXT:    v_max_f32_e32 v7, v7, v7
+; GFX9-NEXT:    v_max_f32_e32 v14, v14, v14
+; GFX9-NEXT:    v_max_f32_e32 v6, v6, v6
+; GFX9-NEXT:    v_max_f32_e32 v13, v13, v13
+; GFX9-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT:    v_max_f32_e32 v12, v12, v12
+; GFX9-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT:    v_max_f32_e32 v11, v11, v11
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT:    v_max_f32_e32 v10, v10, v10
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT:    v_max_f32_e32 v9, v9, v9
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT:    v_max_f32_e32 v8, v8, v8
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT:    v_min_f32_e32 v23, v24, v23
+; GFX9-NEXT:    v_min_f32_e32 v7, v7, v15
+; GFX9-NEXT:    v_min_f32_e32 v6, v6, v14
+; GFX9-NEXT:    v_min_f32_e32 v5, v5, v13
+; GFX9-NEXT:    v_min_f32_e32 v4, v4, v12
+; GFX9-NEXT:    v_min_f32_e32 v3, v3, v11
+; GFX9-NEXT:    v_min_f32_e32 v2, v2, v10
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v9
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v8
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_perm_b32 v0, v0, v23, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v22, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v21, s4
+; GFX9-NEXT:    v_perm_b32 v3, v3, v20, s4
+; GFX9-NEXT:    v_perm_b32 v4, v4, v19, s4
+; GFX9-NEXT:    v_perm_b32 v5, v5, v18, s4
+; GFX9-NEXT:    v_perm_b32 v6, v6, v17, s4
+; GFX9-NEXT:    v_perm_b32 v7, v7, v16, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_fdiv_bf16:
+; GFX10-LABEL: v_minnum_v16bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
-; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
-; GFX10-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
-; GFX10-NEXT:    v_fmac_f32_e32 v3, v4, v3
-; GFX10-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
-; GFX10-NEXT:    v_mul_f32_e32 v5, v4, v3
-; GFX10-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX10-NEXT:    v_fmac_f32_e32 v5, v6, v3
-; GFX10-NEXT:    v_fma_f32 v2, -v2, v5, v4
-; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
-; GFX10-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
+; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
+; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v14
+; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v6
+; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
+; GFX10-NEXT:    v_max_f32_e32 v16, v16, v16
+; GFX10-NEXT:    v_max_f32_e32 v17, v17, v17
+; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
+; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
+; GFX10-NEXT:    v_min_f32_e32 v16, v17, v16
+; GFX10-NEXT:    v_max_f32_e32 v17, v18, v18
+; GFX10-NEXT:    v_max_f32_e32 v18, v19, v19
+; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v13
+; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
+; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v0
+; GFX10-NEXT:    v_min_f32_e32 v17, v18, v17
+; GFX10-NEXT:    v_max_f32_e32 v18, v19, v19
+; GFX10-NEXT:    v_max_f32_e32 v19, v20, v20
+; GFX10-NEXT:    v_max_f32_e32 v20, v21, v21
+; GFX10-NEXT:    v_max_f32_e32 v21, v22, v22
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT:    v_min_f32_e32 v18, v19, v18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_min_f32_e32 v19, v21, v20
+; GFX10-NEXT:    v_max_f32_e32 v20, v22, v22
+; GFX10-NEXT:    v_max_f32_e32 v21, v23, v23
+; GFX10-NEXT:    v_max_f32_e32 v22, v24, v24
+; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_max_f32_e32 v23, v23, v23
+; GFX10-NEXT:    v_max_f32_e32 v24, v24, v24
+; GFX10-NEXT:    v_max_f32_e32 v25, v25, v25
+; GFX10-NEXT:    v_max_f32_e32 v26, v26, v26
+; GFX10-NEXT:    v_max_f32_e32 v27, v27, v27
+; GFX10-NEXT:    v_max_f32_e32 v15, v15, v15
+; GFX10-NEXT:    v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT:    v_max_f32_e32 v14, v14, v14
+; GFX10-NEXT:    v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT:    v_max_f32_e32 v13, v13, v13
+; GFX10-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT:    v_max_f32_e32 v12, v12, v12
+; GFX10-NEXT:    v_max_f32_e32 v11, v11, v11
+; GFX10-NEXT:    v_max_f32_e32 v10, v10, v10
+; GFX10-NEXT:    v_max_f32_e32 v9, v9, v9
+; GFX10-NEXT:    v_max_f32_e32 v8, v8, v8
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT:    v_min_f32_e32 v20, v21, v20
+; GFX10-NEXT:    v_min_f32_e32 v21, v23, v22
+; GFX10-NEXT:    v_min_f32_e32 v22, v25, v24
+; GFX10-NEXT:    v_min_f32_e32 v23, v27, v26
+; GFX10-NEXT:    v_min_f32_e32 v7, v7, v15
+; GFX10-NEXT:    v_min_f32_e32 v6, v6, v14
+; GFX10-NEXT:    v_min_f32_e32 v5, v5, v13
+; GFX10-NEXT:    v_min_f32_e32 v0, v0, v8
+; GFX10-NEXT:    v_min_f32_e32 v1, v1, v9
+; GFX10-NEXT:    v_min_f32_e32 v2, v2, v10
+; GFX10-NEXT:    v_min_f32_e32 v3, v3, v11
+; GFX10-NEXT:    v_min_f32_e32 v4, v4, v12
+; GFX10-NEXT:    v_perm_b32 v0, v0, v23, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v1, v1, v22, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v2, v2, v21, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v3, v3, v20, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v4, v4, v19, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v5, v5, v18, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v6, v6, v17, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v7, v7, v16, 0x3020706
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fdiv_bf16:
+; GFX11-LABEL: v_minnum_v16bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
-; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f32_e32 v3, v4, v3
-; GFX11-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
-; GFX11-NEXT:    v_mul_f32_e32 v5, v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX11-NEXT:    v_fmac_f32_e32 v5, v6, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_f32 v2, -v2, v5, v4
-; GFX11-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
+; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_max_f32 v17, v17, v17 :: v_dual_and_b32 v18, 0xffff0000, v14
+; GFX11-NEXT:    v_dual_max_f32 v16, v16, v16 :: v_dual_and_b32 v19, 0xffff0000, v6
+; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v3
+; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_min_f32 v16, v17, v16 :: v_dual_and_b32 v25, 0xffff0000, v1
+; GFX11-NEXT:    v_dual_max_f32 v17, v18, v18 :: v_dual_max_f32 v18, v19, v19
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v13
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_max_f32 v25, v25, v25 :: v_dual_and_b32 v26, 0xffff0000, v8
+; GFX11-NEXT:    v_dual_min_f32 v17, v18, v17 :: v_dual_max_f32 v18, v19, v19
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_max_f32 v19, v20, v20 :: v_dual_max_f32 v20, v21, v21
+; GFX11-NEXT:    v_dual_max_f32 v21, v22, v22 :: v_dual_and_b32 v22, 0xffff0000, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT:    v_dual_min_f32 v18, v19, v18 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_min_f32 v19, v21, v20 :: v_dual_max_f32 v20, v22, v22
+; GFX11-NEXT:    v_dual_max_f32 v14, v14, v14 :: v_dual_max_f32 v21, v23, v23
+; GFX11-NEXT:    v_dual_max_f32 v22, v24, v24 :: v_dual_lshlrev_b32 v15, 16, v15
+; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_min_f32 v20, v21, v20 :: v_dual_max_f32 v15, v15, v15
+; GFX11-NEXT:    v_dual_max_f32 v26, v26, v26 :: v_dual_and_b32 v27, 0xffff0000, v0
+; GFX11-NEXT:    v_dual_max_f32 v7, v7, v7 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT:    v_dual_max_f32 v23, v23, v23 :: v_dual_max_f32 v24, v24, v24
+; GFX11-NEXT:    v_dual_max_f32 v27, v27, v27 :: v_dual_max_f32 v6, v6, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_max_f32 v13, v13, v13 :: v_dual_lshlrev_b32 v10, 16, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT:    v_dual_min_f32 v21, v23, v22 :: v_dual_min_f32 v22, v25, v24
+; GFX11-NEXT:    v_dual_min_f32 v23, v27, v26 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX11-NEXT:    v_dual_min_f32 v6, v6, v14 :: v_dual_max_f32 v5, v5, v5
+; GFX11-NEXT:    v_dual_max_f32 v10, v10, v10 :: v_dual_max_f32 v11, v11, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_dual_min_f32 v7, v7, v15 :: v_dual_lshlrev_b32 v4, 16, v4
+; GFX11-NEXT:    v_dual_max_f32 v12, v12, v12 :: v_dual_min_f32 v5, v5, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11-NEXT:    v_dual_max_f32 v9, v9, v9 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_max_f32 v8, v8, v8 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v1, v1, v1
+; GFX11-NEXT:    v_dual_min_f32 v0, v0, v8 :: v_dual_min_f32 v3, v3, v11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_min_f32_e32 v2, v2, v10
+; GFX11-NEXT:    v_dual_min_f32 v4, v4, v12 :: v_dual_min_f32 v1, v1, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v23, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v3, v3, v20, 0x3020706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v21, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v4, v4, v19, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v1, v1, v22, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v5, v5, v18, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v6, v6, v17, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x3020706
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %op = fdiv bfloat %a, %b
-  ret bfloat %op
+  %op = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
+  ret <16 x bfloat> %op
 }
 
-declare bfloat @llvm.fabs.bf16(bfloat)
-
-define bfloat @v_fabs_bf16(bfloat %a) {
-; GCN-LABEL: v_fabs_bf16:
+define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v32bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:124
+; GCN-NEXT:    v_min_f32_e32 v31, v32, v31
+; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT:    v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:120
+; GCN-NEXT:    v_min_f32_e32 v30, v30, v32
+; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT:    v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:116
+; GCN-NEXT:    v_min_f32_e32 v29, v29, v32
+; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:112
+; GCN-NEXT:    v_min_f32_e32 v28, v28, v32
+; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:108
+; GCN-NEXT:    v_min_f32_e32 v27, v27, v32
+; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:104
+; GCN-NEXT:    v_min_f32_e32 v26, v26, v32
+; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:100
+; GCN-NEXT:    v_min_f32_e32 v25, v25, v32
+; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v24
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:96
+; GCN-NEXT:    v_min_f32_e32 v24, v24, v32
+; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:92
+; GCN-NEXT:    v_min_f32_e32 v23, v23, v32
+; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:88
+; GCN-NEXT:    v_min_f32_e32 v22, v22, v32
+; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:84
+; GCN-NEXT:    v_min_f32_e32 v21, v21, v32
+; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:80
+; GCN-NEXT:    v_min_f32_e32 v20, v20, v32
+; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:76
+; GCN-NEXT:    v_min_f32_e32 v19, v19, v32
+; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:72
+; GCN-NEXT:    v_min_f32_e32 v18, v18, v32
+; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:68
+; GCN-NEXT:    v_min_f32_e32 v17, v17, v32
+; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:64
+; GCN-NEXT:    v_min_f32_e32 v16, v16, v32
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:60
+; GCN-NEXT:    v_min_f32_e32 v15, v15, v32
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:56
+; GCN-NEXT:    v_min_f32_e32 v14, v14, v32
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:52
+; GCN-NEXT:    v_min_f32_e32 v13, v13, v32
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:48
+; GCN-NEXT:    v_min_f32_e32 v12, v12, v32
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:44
+; GCN-NEXT:    v_min_f32_e32 v11, v11, v32
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:40
+; GCN-NEXT:    v_min_f32_e32 v10, v10, v32
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:36
+; GCN-NEXT:    v_min_f32_e32 v9, v9, v32
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:32
+; GCN-NEXT:    v_min_f32_e32 v8, v8, v32
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:28
+; GCN-NEXT:    v_min_f32_e32 v7, v7, v32
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:24
+; GCN-NEXT:    v_min_f32_e32 v6, v6, v32
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:20
+; GCN-NEXT:    v_min_f32_e32 v5, v5, v32
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:16
+; GCN-NEXT:    v_min_f32_e32 v4, v4, v32
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
+; GCN-NEXT:    v_min_f32_e32 v3, v3, v32
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GCN-NEXT:    v_min_f32_e32 v2, v2, v32
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4
+; GCN-NEXT:    v_min_f32_e32 v1, v1, v32
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v32
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: v_fabs_bf16:
+; GFX7-LABEL: v_minnum_v32bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT:    v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT:    v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT:    v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT:    v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT:    v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v31, v32, v31
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v30, v30, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:120
+; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v29, v29, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v28, v28, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:112
+; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v27, v27, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v26, v26, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:104
+; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v25, v25, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v24, v24, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:96
+; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v23, v23, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v22, v22, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
+; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v21, v21, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v20, v20, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:80
+; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v19, v19, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v18, v18, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
+; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v17, v17, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v16, v16, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v15, v15, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v14, v14, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v13, v13, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v12, v12, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:48
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v11, v11, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v10, v10, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:40
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v9, v9, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v8, v8, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:32
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v7, v7, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v6, v6, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v5, v5, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v4, v4, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:16
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v3, v3, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v1, v1, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_fabs_bf16:
+; GFX8-LABEL: v_minnum_v32bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_and_b32_e32 v31, 0xffff0000, v30
+; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_mul_f32_e32 v31, 1.0, v31
+; GFX8-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT:    v_mul_f32_e32 v30, 1.0, v30
+; GFX8-NEXT:    v_mul_f32_e32 v14, 1.0, v14
+; GFX8-NEXT:    v_min_f32_e32 v31, v32, v31
+; GFX8-NEXT:    v_min_f32_e32 v14, v14, v30
+; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v29
+; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_mul_f32_e32 v30, 1.0, v30
+; GFX8-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT:    v_mul_f32_e32 v29, 1.0, v29
+; GFX8-NEXT:    v_mul_f32_e32 v13, 1.0, v13
+; GFX8-NEXT:    v_min_f32_e32 v30, v32, v30
+; GFX8-NEXT:    v_min_f32_e32 v13, v13, v29
+; GFX8-NEXT:    v_and_b32_e32 v29, 0xffff0000, v28
+; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_mul_f32_e32 v29, 1.0, v29
+; GFX8-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT:    v_mul_f32_e32 v28, 1.0, v28
+; GFX8-NEXT:    v_mul_f32_e32 v12, 1.0, v12
+; GFX8-NEXT:    v_min_f32_e32 v29, v32, v29
+; GFX8-NEXT:    v_min_f32_e32 v12, v12, v28
+; GFX8-NEXT:    v_and_b32_e32 v28, 0xffff0000, v27
+; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v11
+; GFX8-NEXT:    v_mul_f32_e32 v28, 1.0, v28
+; GFX8-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT:    v_min_f32_e32 v28, v32, v28
+; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_mul_f32_e32 v27, 1.0, v27
+; GFX8-NEXT:    v_mul_f32_e32 v11, 1.0, v11
+; GFX8-NEXT:    v_min_f32_e32 v11, v11, v27
+; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT:    v_mul_f32_e32 v27, 1.0, v27
+; GFX8-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v11, v11, v28, s4
+; GFX8-NEXT:    v_perm_b32 v12, v12, v29, s4
+; GFX8-NEXT:    v_perm_b32 v13, v13, v30, s4
+; GFX8-NEXT:    v_perm_b32 v14, v14, v31, s4
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v32
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
+; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT:    v_min_f32_e32 v27, v27, v33
+; GFX8-NEXT:    v_min_f32_e32 v15, v15, v32
+; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v26
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT:    v_mul_f32_e32 v26, 1.0, v26
+; GFX8-NEXT:    v_mul_f32_e32 v10, 1.0, v10
+; GFX8-NEXT:    v_min_f32_e32 v32, v33, v32
+; GFX8-NEXT:    v_min_f32_e32 v10, v10, v26
+; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v25
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_mul_f32_e32 v26, 1.0, v26
+; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT:    v_mul_f32_e32 v25, 1.0, v25
+; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT:    v_min_f32_e32 v26, v33, v26
+; GFX8-NEXT:    v_min_f32_e32 v9, v9, v25
+; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v24
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_mul_f32_e32 v25, 1.0, v25
+; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT:    v_mul_f32_e32 v24, 1.0, v24
+; GFX8-NEXT:    v_mul_f32_e32 v8, 1.0, v8
+; GFX8-NEXT:    v_min_f32_e32 v25, v33, v25
+; GFX8-NEXT:    v_min_f32_e32 v8, v8, v24
+; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v23
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_mul_f32_e32 v24, 1.0, v24
+; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT:    v_mul_f32_e32 v23, 1.0, v23
+; GFX8-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT:    v_min_f32_e32 v24, v33, v24
+; GFX8-NEXT:    v_min_f32_e32 v7, v7, v23
+; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v22
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_mul_f32_e32 v23, 1.0, v23
+; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT:    v_mul_f32_e32 v22, 1.0, v22
+; GFX8-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT:    v_min_f32_e32 v23, v33, v23
+; GFX8-NEXT:    v_min_f32_e32 v6, v6, v22
+; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v21
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_mul_f32_e32 v22, 1.0, v22
+; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT:    v_mul_f32_e32 v21, 1.0, v21
+; GFX8-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT:    v_min_f32_e32 v22, v33, v22
+; GFX8-NEXT:    v_min_f32_e32 v5, v5, v21
+; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v20
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_mul_f32_e32 v21, 1.0, v21
+; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT:    v_mul_f32_e32 v20, 1.0, v20
+; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT:    v_min_f32_e32 v21, v33, v21
+; GFX8-NEXT:    v_min_f32_e32 v4, v4, v20
+; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v19
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_mul_f32_e32 v20, 1.0, v20
+; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT:    v_mul_f32_e32 v19, 1.0, v19
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT:    v_min_f32_e32 v20, v33, v20
+; GFX8-NEXT:    v_min_f32_e32 v3, v3, v19
+; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v18
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_mul_f32_e32 v19, 1.0, v19
+; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT:    v_mul_f32_e32 v18, 1.0, v18
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT:    v_min_f32_e32 v19, v33, v19
+; GFX8-NEXT:    v_min_f32_e32 v2, v2, v18
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v17
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mul_f32_e32 v18, 1.0, v18
+; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT:    v_min_f32_e32 v18, v33, v18
+; GFX8-NEXT:    v_min_f32_e32 v1, v1, v17
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v16
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT:    v_mul_f32_e32 v16, 1.0, v16
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT:    v_min_f32_e32 v17, v33, v17
+; GFX8-NEXT:    v_min_f32_e32 v0, v0, v16
+; GFX8-NEXT:    v_perm_b32 v0, v0, v17, s4
+; GFX8-NEXT:    v_perm_b32 v1, v1, v18, s4
+; GFX8-NEXT:    v_perm_b32 v2, v2, v19, s4
+; GFX8-NEXT:    v_perm_b32 v3, v3, v20, s4
+; GFX8-NEXT:    v_perm_b32 v4, v4, v21, s4
+; GFX8-NEXT:    v_perm_b32 v5, v5, v22, s4
+; GFX8-NEXT:    v_perm_b32 v6, v6, v23, s4
+; GFX8-NEXT:    v_perm_b32 v7, v7, v24, s4
+; GFX8-NEXT:    v_perm_b32 v8, v8, v25, s4
+; GFX8-NEXT:    v_perm_b32 v9, v9, v26, s4
+; GFX8-NEXT:    v_perm_b32 v10, v10, v32, s4
+; GFX8-NEXT:    v_perm_b32 v15, v15, v27, s4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fabs_bf16:
+; GFX9-LABEL: v_minnum_v32bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s32
+; GFX9-NEXT:    v_and_b32_e32 v31, 0xffff0000, v30
+; GFX9-NEXT:    v_and_b32_e32 v32, 0xffff0000, v14
+; GFX9-NEXT:    v_and_b32_e32 v33, 0xffff0000, v29
+; GFX9-NEXT:    v_and_b32_e32 v34, 0xffff0000, v13
+; GFX9-NEXT:    v_and_b32_e32 v36, 0xffff0000, v28
+; GFX9-NEXT:    v_and_b32_e32 v37, 0xffff0000, v12
+; GFX9-NEXT:    v_and_b32_e32 v50, 0xffff0000, v25
+; GFX9-NEXT:    v_and_b32_e32 v51, 0xffff0000, v9
+; GFX9-NEXT:    v_max_f32_e32 v31, v31, v31
+; GFX9-NEXT:    v_max_f32_e32 v32, v32, v32
+; GFX9-NEXT:    v_max_f32_e32 v33, v33, v33
+; GFX9-NEXT:    v_max_f32_e32 v34, v34, v34
+; GFX9-NEXT:    v_max_f32_e32 v36, v36, v36
+; GFX9-NEXT:    v_max_f32_e32 v37, v37, v37
+; GFX9-NEXT:    v_max_f32_e32 v50, v50, v50
+; GFX9-NEXT:    v_max_f32_e32 v51, v51, v51
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
+; GFX9-NEXT:    v_and_b32_e32 v39, 0xffff0000, v11
+; GFX9-NEXT:    v_and_b32_e32 v52, 0xffff0000, v24
+; GFX9-NEXT:    v_and_b32_e32 v53, 0xffff0000, v8
+; GFX9-NEXT:    v_and_b32_e32 v43, 0xffff0000, v21
+; GFX9-NEXT:    v_min_f32_e32 v31, v32, v31
+; GFX9-NEXT:    v_min_f32_e32 v32, v34, v33
+; GFX9-NEXT:    v_min_f32_e32 v33, v37, v36
+; GFX9-NEXT:    v_min_f32_e32 v37, v51, v50
+; GFX9-NEXT:    v_and_b32_e32 v51, 0xffff0000, v5
+; GFX9-NEXT:    v_max_f32_e32 v38, v38, v38
+; GFX9-NEXT:    v_max_f32_e32 v39, v39, v39
+; GFX9-NEXT:    v_max_f32_e32 v52, v52, v52
+; GFX9-NEXT:    v_max_f32_e32 v53, v53, v53
+; GFX9-NEXT:    v_max_f32_e32 v50, v43, v43
+; GFX9-NEXT:    v_max_f32_e32 v51, v51, v51
+; GFX9-NEXT:    v_min_f32_e32 v34, v39, v38
+; GFX9-NEXT:    v_min_f32_e32 v38, v53, v52
+; GFX9-NEXT:    v_min_f32_e32 v50, v51, v50
+; GFX9-NEXT:    v_and_b32_e32 v51, 0xffff0000, v20
+; GFX9-NEXT:    v_and_b32_e32 v52, 0xffff0000, v4
+; GFX9-NEXT:    v_max_f32_e32 v51, v51, v51
+; GFX9-NEXT:    v_max_f32_e32 v52, v52, v52
+; GFX9-NEXT:    v_and_b32_e32 v54, 0xffff0000, v23
+; GFX9-NEXT:    v_and_b32_e32 v55, 0xffff0000, v7
+; GFX9-NEXT:    v_min_f32_e32 v51, v52, v51
+; GFX9-NEXT:    v_and_b32_e32 v52, 0xffff0000, v19
+; GFX9-NEXT:    v_and_b32_e32 v53, 0xffff0000, v3
+; GFX9-NEXT:    v_max_f32_e32 v54, v54, v54
+; GFX9-NEXT:    v_max_f32_e32 v55, v55, v55
+; GFX9-NEXT:    v_max_f32_e32 v52, v52, v52
+; GFX9-NEXT:    v_max_f32_e32 v53, v53, v53
+; GFX9-NEXT:    v_min_f32_e32 v39, v55, v54
+; GFX9-NEXT:    v_min_f32_e32 v52, v53, v52
+; GFX9-NEXT:    v_and_b32_e32 v53, 0xffff0000, v18
+; GFX9-NEXT:    v_and_b32_e32 v54, 0xffff0000, v2
+; GFX9-NEXT:    v_max_f32_e32 v53, v53, v53
+; GFX9-NEXT:    v_max_f32_e32 v54, v54, v54
+; GFX9-NEXT:    v_and_b32_e32 v48, 0xffff0000, v26
+; GFX9-NEXT:    v_and_b32_e32 v49, 0xffff0000, v10
+; GFX9-NEXT:    v_and_b32_e32 v40, 0xffff0000, v22
+; GFX9-NEXT:    v_and_b32_e32 v41, 0xffff0000, v6
+; GFX9-NEXT:    v_min_f32_e32 v53, v54, v53
+; GFX9-NEXT:    v_and_b32_e32 v54, 0xffff0000, v17
+; GFX9-NEXT:    v_and_b32_e32 v55, 0xffff0000, v1
+; GFX9-NEXT:    v_max_f32_e32 v48, v48, v48
+; GFX9-NEXT:    v_max_f32_e32 v49, v49, v49
+; GFX9-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX9-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX9-NEXT:    v_max_f32_e32 v54, v54, v54
+; GFX9-NEXT:    v_max_f32_e32 v55, v55, v55
+; GFX9-NEXT:    v_and_b32_e32 v42, 0xffff0000, v15
+; GFX9-NEXT:    v_min_f32_e32 v36, v49, v48
+; GFX9-NEXT:    v_min_f32_e32 v48, v41, v40
+; GFX9-NEXT:    v_min_f32_e32 v54, v55, v54
+; GFX9-NEXT:    v_and_b32_e32 v55, 0xffff0000, v16
+; GFX9-NEXT:    v_and_b32_e32 v40, 0xffff0000, v0
+; GFX9-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-NEXT:    v_max_f32_e32 v55, v55, v55
+; GFX9-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX9-NEXT:    v_min_f32_e32 v55, v40, v55
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    v_and_b32_e32 v49, 0xffff0000, v35
+; GFX9-NEXT:    v_max_f32_e32 v49, v49, v49
+; GFX9-NEXT:    v_min_f32_e32 v49, v42, v49
+; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_lshlrev_b32_e32 v35, 16, v35
+; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_max_f32_e32 v35, v35, v35
+; GFX9-NEXT:    v_max_f32_e32 v15, v15, v15
+; GFX9-NEXT:    v_max_f32_e32 v30, v30, v30
+; GFX9-NEXT:    v_max_f32_e32 v14, v14, v14
+; GFX9-NEXT:    v_max_f32_e32 v29, v29, v29
+; GFX9-NEXT:    v_max_f32_e32 v13, v13, v13
+; GFX9-NEXT:    v_max_f32_e32 v28, v28, v28
+; GFX9-NEXT:    v_max_f32_e32 v12, v12, v12
+; GFX9-NEXT:    v_max_f32_e32 v27, v27, v27
+; GFX9-NEXT:    v_max_f32_e32 v11, v11, v11
+; GFX9-NEXT:    v_max_f32_e32 v26, v26, v26
+; GFX9-NEXT:    v_max_f32_e32 v10, v10, v10
+; GFX9-NEXT:    v_max_f32_e32 v25, v25, v25
+; GFX9-NEXT:    v_max_f32_e32 v9, v9, v9
+; GFX9-NEXT:    v_max_f32_e32 v24, v24, v24
+; GFX9-NEXT:    v_max_f32_e32 v8, v8, v8
+; GFX9-NEXT:    v_max_f32_e32 v23, v23, v23
+; GFX9-NEXT:    v_max_f32_e32 v7, v7, v7
+; GFX9-NEXT:    v_max_f32_e32 v22, v22, v22
+; GFX9-NEXT:    v_max_f32_e32 v6, v6, v6
+; GFX9-NEXT:    v_max_f32_e32 v21, v21, v21
+; GFX9-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT:    v_max_f32_e32 v20, v20, v20
+; GFX9-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT:    v_max_f32_e32 v19, v19, v19
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT:    v_max_f32_e32 v18, v18, v18
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT:    v_max_f32_e32 v17, v17, v17
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT:    v_max_f32_e32 v16, v16, v16
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT:    v_min_f32_e32 v15, v15, v35
+; GFX9-NEXT:    v_min_f32_e32 v14, v14, v30
+; GFX9-NEXT:    v_min_f32_e32 v13, v13, v29
+; GFX9-NEXT:    v_min_f32_e32 v12, v12, v28
+; GFX9-NEXT:    v_min_f32_e32 v11, v11, v27
+; GFX9-NEXT:    v_min_f32_e32 v10, v10, v26
+; GFX9-NEXT:    v_min_f32_e32 v9, v9, v25
+; GFX9-NEXT:    v_min_f32_e32 v8, v8, v24
+; GFX9-NEXT:    v_min_f32_e32 v7, v7, v23
+; GFX9-NEXT:    v_min_f32_e32 v6, v6, v22
+; GFX9-NEXT:    v_min_f32_e32 v5, v5, v21
+; GFX9-NEXT:    v_min_f32_e32 v4, v4, v20
+; GFX9-NEXT:    v_min_f32_e32 v3, v3, v19
+; GFX9-NEXT:    v_min_f32_e32 v2, v2, v18
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v17
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v16
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_perm_b32 v0, v0, v55, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v54, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v53, s4
+; GFX9-NEXT:    v_perm_b32 v3, v3, v52, s4
+; GFX9-NEXT:    v_perm_b32 v4, v4, v51, s4
+; GFX9-NEXT:    v_perm_b32 v5, v5, v50, s4
+; GFX9-NEXT:    v_perm_b32 v6, v6, v48, s4
+; GFX9-NEXT:    v_perm_b32 v7, v7, v39, s4
+; GFX9-NEXT:    v_perm_b32 v8, v8, v38, s4
+; GFX9-NEXT:    v_perm_b32 v9, v9, v37, s4
+; GFX9-NEXT:    v_perm_b32 v10, v10, v36, s4
+; GFX9-NEXT:    v_perm_b32 v11, v11, v34, s4
+; GFX9-NEXT:    v_perm_b32 v12, v12, v33, s4
+; GFX9-NEXT:    v_perm_b32 v13, v13, v32, s4
+; GFX9-NEXT:    v_perm_b32 v14, v14, v31, s4
+; GFX9-NEXT:    v_perm_b32 v15, v15, v49, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_fabs_bf16:
+; GFX10-LABEL: v_minnum_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT:    v_and_b32_e32 v53, 0xffff0000, v24
+; GFX10-NEXT:    v_and_b32_e32 v54, 0xffff0000, v8
+; GFX10-NEXT:    v_and_b32_e32 v55, 0xffff0000, v23
+; GFX10-NEXT:    v_and_b32_e32 v64, 0xffff0000, v7
+; GFX10-NEXT:    v_and_b32_e32 v65, 0xffff0000, v22
+; GFX10-NEXT:    v_and_b32_e32 v66, 0xffff0000, v6
+; GFX10-NEXT:    v_and_b32_e32 v67, 0xffff0000, v21
+; GFX10-NEXT:    v_and_b32_e32 v68, 0xffff0000, v5
+; GFX10-NEXT:    v_max_f32_e32 v53, v53, v53
+; GFX10-NEXT:    v_max_f32_e32 v54, v54, v54
+; GFX10-NEXT:    v_max_f32_e32 v55, v55, v55
+; GFX10-NEXT:    v_max_f32_e32 v64, v64, v64
+; GFX10-NEXT:    v_max_f32_e32 v65, v65, v65
+; GFX10-NEXT:    v_max_f32_e32 v66, v66, v66
+; GFX10-NEXT:    v_max_f32_e32 v67, v67, v67
+; GFX10-NEXT:    v_max_f32_e32 v68, v68, v68
+; GFX10-NEXT:    v_and_b32_e32 v32, 0xffff0000, v30
+; GFX10-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
+; GFX10-NEXT:    v_and_b32_e32 v35, 0xffff0000, v29
+; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v13
+; GFX10-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
+; GFX10-NEXT:    v_and_b32_e32 v38, 0xffff0000, v12
+; GFX10-NEXT:    v_and_b32_e32 v39, 0xffff0000, v27
+; GFX10-NEXT:    v_and_b32_e32 v48, 0xffff0000, v11
+; GFX10-NEXT:    v_and_b32_e32 v49, 0xffff0000, v26
+; GFX10-NEXT:    v_and_b32_e32 v50, 0xffff0000, v10
+; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v25
+; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v9
+; GFX10-NEXT:    v_min_f32_e32 v53, v54, v53
+; GFX10-NEXT:    v_and_b32_e32 v54, 0xffff0000, v17
+; GFX10-NEXT:    v_min_f32_e32 v55, v64, v55
+; GFX10-NEXT:    v_and_b32_e32 v64, 0xffff0000, v1
+; GFX10-NEXT:    v_min_f32_e32 v65, v66, v65
+; GFX10-NEXT:    v_and_b32_e32 v66, 0xffff0000, v16
+; GFX10-NEXT:    v_min_f32_e32 v67, v68, v67
+; GFX10-NEXT:    v_and_b32_e32 v68, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_max_f32_e32 v32, v32, v32
+; GFX10-NEXT:    v_max_f32_e32 v34, v34, v34
+; GFX10-NEXT:    v_max_f32_e32 v35, v35, v35
+; GFX10-NEXT:    v_max_f32_e32 v36, v36, v36
+; GFX10-NEXT:    v_max_f32_e32 v37, v37, v37
+; GFX10-NEXT:    v_max_f32_e32 v38, v38, v38
+; GFX10-NEXT:    v_max_f32_e32 v39, v39, v39
+; GFX10-NEXT:    v_max_f32_e32 v48, v48, v48
+; GFX10-NEXT:    v_max_f32_e32 v49, v49, v49
+; GFX10-NEXT:    v_max_f32_e32 v50, v50, v50
+; GFX10-NEXT:    v_max_f32_e32 v51, v51, v51
+; GFX10-NEXT:    v_max_f32_e32 v52, v52, v52
+; GFX10-NEXT:    v_max_f32_e32 v17, v17, v17
+; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT:    v_max_f32_e32 v16, v16, v16
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT:    v_and_b32_e32 v33, 0xffff0000, v15
+; GFX10-NEXT:    v_min_f32_e32 v32, v34, v32
+; GFX10-NEXT:    v_and_b32_e32 v34, 0xffff0000, v20
+; GFX10-NEXT:    v_min_f32_e32 v35, v36, v35
+; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v4
+; GFX10-NEXT:    v_min_f32_e32 v37, v38, v37
+; GFX10-NEXT:    v_and_b32_e32 v38, 0xffff0000, v19
+; GFX10-NEXT:    v_min_f32_e32 v39, v48, v39
+; GFX10-NEXT:    v_and_b32_e32 v48, 0xffff0000, v3
+; GFX10-NEXT:    v_min_f32_e32 v49, v50, v49
+; GFX10-NEXT:    v_and_b32_e32 v50, 0xffff0000, v18
+; GFX10-NEXT:    v_min_f32_e32 v51, v52, v51
+; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_min_f32_e32 v0, v0, v16
+; GFX10-NEXT:    v_min_f32_e32 v1, v1, v17
+; GFX10-NEXT:    v_max_f32_e32 v33, v33, v33
+; GFX10-NEXT:    v_max_f32_e32 v34, v34, v34
+; GFX10-NEXT:    v_max_f32_e32 v36, v36, v36
+; GFX10-NEXT:    v_max_f32_e32 v38, v38, v38
+; GFX10-NEXT:    v_max_f32_e32 v48, v48, v48
+; GFX10-NEXT:    v_max_f32_e32 v50, v50, v50
+; GFX10-NEXT:    v_max_f32_e32 v52, v52, v52
+; GFX10-NEXT:    v_max_f32_e32 v54, v54, v54
+; GFX10-NEXT:    v_max_f32_e32 v64, v64, v64
+; GFX10-NEXT:    v_max_f32_e32 v66, v66, v66
+; GFX10-NEXT:    v_max_f32_e32 v68, v68, v68
+; GFX10-NEXT:    v_max_f32_e32 v15, v15, v15
+; GFX10-NEXT:    v_max_f32_e32 v30, v30, v30
+; GFX10-NEXT:    v_max_f32_e32 v14, v14, v14
+; GFX10-NEXT:    v_max_f32_e32 v29, v29, v29
+; GFX10-NEXT:    v_max_f32_e32 v13, v13, v13
+; GFX10-NEXT:    v_max_f32_e32 v28, v28, v28
+; GFX10-NEXT:    v_max_f32_e32 v12, v12, v12
+; GFX10-NEXT:    v_max_f32_e32 v27, v27, v27
+; GFX10-NEXT:    v_max_f32_e32 v11, v11, v11
+; GFX10-NEXT:    v_max_f32_e32 v26, v26, v26
+; GFX10-NEXT:    v_max_f32_e32 v10, v10, v10
+; GFX10-NEXT:    v_max_f32_e32 v25, v25, v25
+; GFX10-NEXT:    v_max_f32_e32 v9, v9, v9
+; GFX10-NEXT:    v_max_f32_e32 v24, v24, v24
+; GFX10-NEXT:    v_max_f32_e32 v8, v8, v8
+; GFX10-NEXT:    v_max_f32_e32 v23, v23, v23
+; GFX10-NEXT:    v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT:    v_max_f32_e32 v22, v22, v22
+; GFX10-NEXT:    v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT:    v_max_f32_e32 v21, v21, v21
+; GFX10-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT:    v_max_f32_e32 v20, v20, v20
+; GFX10-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT:    v_max_f32_e32 v19, v19, v19
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT:    v_max_f32_e32 v18, v18, v18
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT:    v_min_f32_e32 v34, v36, v34
+; GFX10-NEXT:    v_min_f32_e32 v36, v48, v38
+; GFX10-NEXT:    v_min_f32_e32 v38, v52, v50
+; GFX10-NEXT:    v_min_f32_e32 v48, v64, v54
+; GFX10-NEXT:    v_min_f32_e32 v50, v68, v66
+; GFX10-NEXT:    v_min_f32_e32 v14, v14, v30
+; GFX10-NEXT:    v_min_f32_e32 v13, v13, v29
+; GFX10-NEXT:    v_min_f32_e32 v12, v12, v28
+; GFX10-NEXT:    v_min_f32_e32 v11, v11, v27
+; GFX10-NEXT:    v_min_f32_e32 v10, v10, v26
+; GFX10-NEXT:    v_min_f32_e32 v9, v9, v25
+; GFX10-NEXT:    v_min_f32_e32 v8, v8, v24
+; GFX10-NEXT:    v_min_f32_e32 v7, v7, v23
+; GFX10-NEXT:    v_min_f32_e32 v6, v6, v22
+; GFX10-NEXT:    v_min_f32_e32 v5, v5, v21
+; GFX10-NEXT:    v_min_f32_e32 v2, v2, v18
+; GFX10-NEXT:    v_min_f32_e32 v3, v3, v19
+; GFX10-NEXT:    v_min_f32_e32 v4, v4, v20
+; GFX10-NEXT:    v_perm_b32 v0, v0, v50, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v1, v1, v48, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v2, v2, v38, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v3, v3, v36, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v4, v4, v34, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v5, v5, v67, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v6, v6, v65, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v7, v7, v55, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v8, v8, v53, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v9, v9, v51, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v10, v10, v49, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v11, v11, v39, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v12, v12, v37, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v13, v13, v35, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v14, v14, v32, 0x3020706
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v31
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v31
+; GFX10-NEXT:    v_max_f32_e32 v16, v16, v16
+; GFX10-NEXT:    v_max_f32_e32 v17, v17, v17
+; GFX10-NEXT:    v_min_f32_e32 v16, v33, v16
+; GFX10-NEXT:    v_min_f32_e32 v15, v15, v17
+; GFX10-NEXT:    v_perm_b32 v15, v15, v16, 0x3020706
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fabs_bf16:
+; GFX11-LABEL: v_minnum_v32bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff0000, v30
+; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff0000, v29
+; GFX11-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
+; GFX11-NEXT:    v_and_b32_e32 v38, 0xffff0000, v12
+; GFX11-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
+; GFX11-NEXT:    v_and_b32_e32 v39, 0xffff0000, v27
+; GFX11-NEXT:    v_and_b32_e32 v36, 0xffff0000, v13
+; GFX11-NEXT:    v_dual_max_f32 v33, v33, v33 :: v_dual_and_b32 v32, 0xffff0000, v15
+; GFX11-NEXT:    v_and_b32_e32 v49, 0xffff0000, v26
+; GFX11-NEXT:    v_and_b32_e32 v48, 0xffff0000, v11
+; GFX11-NEXT:    v_and_b32_e32 v51, 0xffff0000, v25
+; GFX11-NEXT:    v_and_b32_e32 v50, 0xffff0000, v10
+; GFX11-NEXT:    v_and_b32_e32 v54, 0xffff0000, v8
+; GFX11-NEXT:    v_and_b32_e32 v53, 0xffff0000, v24
+; GFX11-NEXT:    v_and_b32_e32 v55, 0xffff0000, v23
+; GFX11-NEXT:    v_and_b32_e32 v52, 0xffff0000, v9
+; GFX11-NEXT:    v_and_b32_e32 v65, 0xffff0000, v22
+; GFX11-NEXT:    v_and_b32_e32 v67, 0xffff0000, v21
+; GFX11-NEXT:    v_and_b32_e32 v66, 0xffff0000, v6
+; GFX11-NEXT:    v_and_b32_e32 v71, 0xffff0000, v19
+; GFX11-NEXT:    v_and_b32_e32 v68, 0xffff0000, v5
+; GFX11-NEXT:    v_and_b32_e32 v83, 0xffff0000, v17
+; GFX11-NEXT:    v_and_b32_e32 v86, 0xffff0000, v0
+; GFX11-NEXT:    v_and_b32_e32 v85, 0xffff0000, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT:    v_and_b32_e32 v84, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_dual_max_f32 v35, v35, v35 :: v_dual_max_f32 v34, v34, v34
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT:    v_dual_max_f32 v38, v38, v38 :: v_dual_max_f32 v37, v37, v37
+; GFX11-NEXT:    v_dual_max_f32 v39, v39, v39 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT:    v_max_f32_e32 v36, v36, v36
+; GFX11-NEXT:    v_dual_max_f32 v65, v65, v65 :: v_dual_and_b32 v64, 0xffff0000, v7
+; GFX11-NEXT:    v_and_b32_e32 v70, 0xffff0000, v4
+; GFX11-NEXT:    v_and_b32_e32 v69, 0xffff0000, v20
+; GFX11-NEXT:    v_and_b32_e32 v81, 0xffff0000, v18
+; GFX11-NEXT:    v_dual_max_f32 v83, v83, v83 :: v_dual_and_b32 v82, 0xffff0000, v2
+; GFX11-NEXT:    v_dual_max_f32 v17, v17, v17 :: v_dual_lshlrev_b32 v18, 16, v18
+; GFX11-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-NEXT:    v_dual_max_f32 v49, v49, v49 :: v_dual_max_f32 v48, v48, v48
+; GFX11-NEXT:    v_dual_max_f32 v51, v51, v51 :: v_dual_max_f32 v50, v50, v50
+; GFX11-NEXT:    v_dual_max_f32 v54, v54, v54 :: v_dual_max_f32 v53, v53, v53
+; GFX11-NEXT:    v_dual_max_f32 v67, v67, v67 :: v_dual_max_f32 v66, v66, v66
+; GFX11-NEXT:    v_dual_max_f32 v25, v25, v25 :: v_dual_max_f32 v26, v26, v26
+; GFX11-NEXT:    v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v10, v10, v10
+; GFX11-NEXT:    v_dual_max_f32 v21, v21, v21 :: v_dual_max_f32 v22, v22, v22
+; GFX11-NEXT:    v_dual_max_f32 v5, v5, v5 :: v_dual_max_f32 v6, v6, v6
+; GFX11-NEXT:    v_dual_min_f32 v33, v34, v33 :: v_dual_max_f32 v16, v16, v16
+; GFX11-NEXT:    v_dual_min_f32 v34, v36, v35 :: v_dual_min_f32 v35, v38, v37
+; GFX11-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX11-NEXT:    v_dual_max_f32 v81, v81, v81 :: v_dual_and_b32 v80, 0xffff0000, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_dual_max_f32 v70, v70, v70 :: v_dual_max_f32 v69, v69, v69
+; GFX11-NEXT:    v_dual_min_f32 v36, v48, v39 :: v_dual_min_f32 v37, v50, v49
+; GFX11-NEXT:    v_min_f32_e32 v39, v54, v53
+; GFX11-NEXT:    v_dual_min_f32 v10, v10, v26 :: v_dual_min_f32 v1, v1, v17
+; GFX11-NEXT:    v_min_f32_e32 v6, v6, v22
+; GFX11-NEXT:    v_dual_max_f32 v32, v32, v32 :: v_dual_max_f32 v55, v55, v55
+; GFX11-NEXT:    v_max_f32_e32 v52, v52, v52
+; GFX11-NEXT:    v_dual_max_f32 v64, v64, v64 :: v_dual_max_f32 v71, v71, v71
+; GFX11-NEXT:    v_max_f32_e32 v68, v68, v68
+; GFX11-NEXT:    v_max_f32_e32 v80, v80, v80
+; GFX11-NEXT:    v_max_f32_e32 v82, v82, v82
+; GFX11-NEXT:    v_dual_max_f32 v86, v86, v86 :: v_dual_max_f32 v85, v85, v85
+; GFX11-NEXT:    v_dual_max_f32 v15, v15, v15 :: v_dual_max_f32 v84, v84, v84
+; GFX11-NEXT:    v_dual_max_f32 v29, v29, v29 :: v_dual_max_f32 v30, v30, v30
+; GFX11-NEXT:    v_dual_max_f32 v13, v13, v13 :: v_dual_max_f32 v14, v14, v14
+; GFX11-NEXT:    v_dual_max_f32 v27, v27, v27 :: v_dual_max_f32 v28, v28, v28
+; GFX11-NEXT:    v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v12, v12, v12
+; GFX11-NEXT:    v_dual_max_f32 v23, v23, v23 :: v_dual_max_f32 v24, v24, v24
+; GFX11-NEXT:    v_dual_max_f32 v7, v7, v7 :: v_dual_max_f32 v8, v8, v8
+; GFX11-NEXT:    v_dual_max_f32 v19, v19, v19 :: v_dual_max_f32 v20, v20, v20
+; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v4, v4, v4
+; GFX11-NEXT:    v_max_f32_e32 v18, v18, v18
+; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX11-NEXT:    v_dual_min_f32 v38, v52, v51 :: v_dual_min_f32 v53, v82, v81
+; GFX11-NEXT:    v_dual_min_f32 v48, v64, v55 :: v_dual_min_f32 v55, v86, v85
+; GFX11-NEXT:    v_dual_min_f32 v49, v66, v65 :: v_dual_min_f32 v50, v68, v67
+; GFX11-NEXT:    v_min_f32_e32 v13, v13, v29
+; GFX11-NEXT:    v_dual_min_f32 v51, v70, v69 :: v_dual_min_f32 v52, v80, v71
+; GFX11-NEXT:    v_dual_min_f32 v9, v9, v25 :: v_dual_min_f32 v54, v84, v83
+; GFX11-NEXT:    v_dual_min_f32 v5, v5, v21 :: v_dual_min_f32 v14, v14, v30
+; GFX11-NEXT:    v_dual_min_f32 v11, v11, v27 :: v_dual_min_f32 v12, v12, v28
+; GFX11-NEXT:    v_dual_min_f32 v7, v7, v23 :: v_dual_min_f32 v8, v8, v24
+; GFX11-NEXT:    v_dual_min_f32 v3, v3, v19 :: v_dual_min_f32 v4, v4, v20
+; GFX11-NEXT:    v_perm_b32 v1, v1, v54, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v5, v5, v50, 0x3020706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v7, v7, v48, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v3, v3, v52, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v4, v4, v51, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v8, v8, v39, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v9, v9, v38, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v10, v10, v37, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v11, v11, v36, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v12, v12, v35, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v13, v13, v34, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v14, v14, v33, 0x3020706
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_min_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v17, 16, v31
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v31
+; GFX11-NEXT:    v_perm_b32 v6, v6, v49, 0x3020706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_max_f32 v17, v17, v17 :: v_dual_min_f32 v2, v2, v18
+; GFX11-NEXT:    v_max_f32_e32 v16, v16, v16
+; GFX11-NEXT:    v_perm_b32 v0, v0, v55, 0x3020706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_min_f32_e32 v15, v15, v17
+; GFX11-NEXT:    v_perm_b32 v2, v2, v53, 0x3020706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_min_f32_e32 v16, v32, v16
+; GFX11-NEXT:    v_perm_b32 v15, v15, v16, 0x3020706
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %op = call bfloat @llvm.fabs.bf16(bfloat %a)
-  ret bfloat %op
+  %op = call <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
+  ret <32 x bfloat> %op
 }
 
-define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) {
-; GCN-LABEL: s_fabs_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_bfe_u32 s0, s0, 0xf0010
-; GCN-NEXT:    ; return to shader part epilog
-;
-; GFX7-LABEL: s_fabs_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_bfe_u32 s0, s0, 0xf0010
-; GFX7-NEXT:    ; return to shader part epilog
-;
-; GFX8-LABEL: s_fabs_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_bfe_u32 s0, s0, 0xf0010
-; GFX8-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: s_fabs_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_bfe_u32 s0, s0, 0xf0010
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: s_fabs_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_bfe_u32 s0, s0, 0xf0010
-; GFX10-NEXT:    ; return to shader part epilog
-;
-; GFX11-LABEL: s_fabs_bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_bfe_u32 s0, s0, 0xf0010
-; GFX11-NEXT:    ; return to shader part epilog
-  %op = call bfloat @llvm.fabs.bf16(bfloat %a)
-  %cast = bitcast bfloat %op to i16
-  %zext = zext i16 %cast to i32
-  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
-  ret i32 %readlane
-}
 
-define bfloat @v_fneg_bf16(bfloat %a) {
-; GCN-LABEL: v_fneg_bf16:
+declare bfloat @llvm.maxnum.bf16(bfloat, bfloat)
+declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
+declare <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat>, <3 x bfloat>)
+declare <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
+declare <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat>, <8 x bfloat>)
+declare <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat>, <16 x bfloat>)
+declare <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat>, <32 x bfloat>)
+
+define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_maxnum_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: v_fneg_bf16:
+; GFX7-LABEL: v_maxnum_bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_fneg_bf16:
+; GFX8-LABEL: v_maxnum_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fneg_bf16:
+; GFX9-LABEL: v_maxnum_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_fneg_bf16:
+; GFX10-LABEL: v_maxnum_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fneg_bf16:
+; GFX11-LABEL: v_maxnum_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %op = fneg bfloat %a
+  %op = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b)
   ret bfloat %op
 }
 
-declare i32 @llvm.amdgcn.readfirstlane(i32)
-
-; FIXME: readfirstlane hack for other bugs
-define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) {
-; GCN-LABEL: s_fneg_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_lshr_b32 s0, s0, 16
-; GCN-NEXT:    s_xor_b32 s0, s0, 0x8000
-; GCN-NEXT:    ; return to shader part epilog
-;
-; GFX7-LABEL: s_fneg_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX7-NEXT:    s_xor_b32 s0, s0, 0x8000
-; GFX7-NEXT:    ; return to shader part epilog
-;
-; GFX8-LABEL: s_fneg_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX8-NEXT:    v_mov_b32_e32 v0, 0xffff8000
-; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX8-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: s_fneg_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff8000
-; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: s_fneg_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX10-NEXT:    v_xor_b32_e64 v0, 0xffff8000, s0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    ; return to shader part epilog
-;
-; GFX11-LABEL: s_fneg_bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_xor_b32_e64 v0, 0xffff8000, s0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-NEXT:    ; return to shader part epilog
-  %op = fneg bfloat %a
-  %cast = bitcast bfloat %op to i16
-  %zext = zext i16 %cast to i32
-  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
-  ret i32 %readlane
-}
-
-define bfloat @v_fneg_fabs_bf16(bfloat %a) {
-; GCN-LABEL: v_fneg_fabs_bf16:
+define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v2bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_max_f32_e32 v1, v1, v3
+; GCN-NEXT:    v_max_f32_e32 v0, v0, v2
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: v_fneg_fabs_bf16:
+; GFX7-LABEL: v_maxnum_v2bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_fneg_fabs_bf16:
+; GFX8-LABEL: v_maxnum_v2bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT:    v_max_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fneg_fabs_bf16:
+; GFX9-LABEL: v_maxnum_v2bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT:    v_max_f32_e32 v2, v3, v2
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_fneg_fabs_bf16:
+; GFX10-LABEL: v_maxnum_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT:    v_max_f32_e32 v2, v3, v2
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fneg_fabs_bf16:
+; GFX11-LABEL: v_maxnum_v2bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2
+; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_max_f32_e32 v2, v3, v2
+; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
-  %op = fneg bfloat %fabs
-  ret bfloat %op
+  %op = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
+  ret <2 x bfloat> %op
 }
 
-; FIXME: readfirstlane hack for other bugs
-define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
-; GCN-LABEL: s_fneg_fabs_bf16:
+define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v3bf16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_lshr_b32 s0, s0, 16
-; GCN-NEXT:    s_bitset1_b32 s0, 15
-; GCN-NEXT:    ; return to shader part epilog
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_max_f32_e32 v2, v2, v5
+; GCN-NEXT:    v_max_f32_e32 v1, v1, v4
+; GCN-NEXT:    v_max_f32_e32 v0, v0, v3
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: s_fneg_fabs_bf16:
+; GFX7-LABEL: v_maxnum_v3bf16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX7-NEXT:    s_bitset1_b32 s0, 15
-; GFX7-NEXT:    ; return to shader part epilog
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v5
+; GFX7-NEXT:    v_max_f32_e32 v1, v1, v4
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: s_fneg_fabs_bf16:
+; GFX8-LABEL: v_maxnum_v3bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX8-NEXT:    v_mov_b32_e32 v0, 0xffff8000
-; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX8-NEXT:    ; return to shader part epilog
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT:    v_max_f32_e32 v3, v4, v3
+; GFX8-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: s_fneg_fabs_bf16:
+; GFX9-LABEL: v_maxnum_v3bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff8000
-; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX9-NEXT:    ; return to shader part epilog
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT:    v_max_f32_e32 v3, v4, v3
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: s_fneg_fabs_bf16:
+; GFX10-LABEL: v_maxnum_v3bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX10-NEXT:    v_or_b32_e64 v0, 0xffff8000, s0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    ; return to shader part epilog
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT:    v_max_f32_e32 v4, v5, v4
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: s_fneg_fabs_bf16:
+; GFX11-LABEL: v_maxnum_v3bf16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e64 v0, 0xffff8000, s0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_max_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_and_b32 v5, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v0, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_max_f32 v1, v1, v3 :: v_dual_max_f32 v0, v0, v2
+; GFX11-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_max_f32_e32 v4, v5, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-NEXT:    ; return to shader part epilog
-  %fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
-  %op = fneg bfloat %fabs
-  %cast = bitcast bfloat %op to i16
-  %zext = zext i16 %cast to i32
-  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
-  ret i32 %readlane
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
+  ret <3 x bfloat> %op
 }
 
-declare bfloat @llvm.minnum.bf16(bfloat, bfloat)
-declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
-
-define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_minnum_bf16:
+define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_max_f32_e32 v3, v3, v7
+; GCN-NEXT:    v_max_f32_e32 v2, v2, v6
+; GCN-NEXT:    v_max_f32_e32 v1, v1, v5
+; GCN-NEXT:    v_max_f32_e32 v0, v0, v4
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: v_minnum_bf16:
+; GFX7-LABEL: v_maxnum_v4bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_max_f32_e32 v1, v1, v5
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v4
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_minnum_bf16:
+; GFX8-LABEL: v_maxnum_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT:    v_max_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_max_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    v_perm_b32 v1, v1, v4, s4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minnum_bf16:
+; GFX9-LABEL: v_maxnum_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT:    v_max_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_max_f32_e32 v3, v5, v3
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_minnum_bf16:
+; GFX10-LABEL: v_maxnum_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT:    v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT:    v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_max_f32_e32 v3, v5, v4
+; GFX10-NEXT:    v_max_f32_e32 v4, v7, v6
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
+; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_minnum_bf16:
+; GFX11-LABEL: v_maxnum_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX11-NEXT:    v_dual_max_f32 v5, v5, v5 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-NEXT:    v_dual_max_f32 v4, v4, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_max_f32_e32 v4, v5, v4
+; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_and_b32 v6, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_max_f32 v5, v6, v6 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_dual_max_f32 v6, v7, v7 :: v_dual_max_f32 v1, v1, v1
+; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_max_f32_e32 v4, v6, v5
+; GFX11-NEXT:    v_dual_max_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
+; GFX11-NEXT:    v_or_b32_e32 v1, v3, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %op = call bfloat @llvm.minnum.bf16(bfloat %a, bfloat %b)
-  ret bfloat %op
+  %op = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
+  ret <4 x bfloat> %op
 }
 
-define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
-; GCN-LABEL: v_minnum_v2bf16:
+define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v8bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
 ; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
 ; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
 ; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT:    v_min_f32_e32 v1, v1, v3
-; GCN-NEXT:    v_min_f32_e32 v0, v0, v2
+; GCN-NEXT:    v_max_f32_e32 v7, v7, v15
+; GCN-NEXT:    v_max_f32_e32 v6, v6, v14
+; GCN-NEXT:    v_max_f32_e32 v5, v5, v13
+; GCN-NEXT:    v_max_f32_e32 v4, v4, v12
+; GCN-NEXT:    v_max_f32_e32 v3, v3, v11
+; GCN-NEXT:    v_max_f32_e32 v2, v2, v10
+; GCN-NEXT:    v_max_f32_e32 v1, v1, v9
+; GCN-NEXT:    v_max_f32_e32 v0, v0, v8
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: v_minnum_v2bf16:
+; GFX7-LABEL: v_maxnum_v8bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
 ; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_max_f32_e32 v7, v7, v15
+; GFX7-NEXT:    v_max_f32_e32 v6, v6, v14
+; GFX7-NEXT:    v_max_f32_e32 v5, v5, v13
+; GFX7-NEXT:    v_max_f32_e32 v4, v4, v12
+; GFX7-NEXT:    v_max_f32_e32 v3, v3, v11
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v10
+; GFX7-NEXT:    v_max_f32_e32 v1, v1, v9
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v8
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_minnum_v2bf16:
+; GFX8-LABEL: v_maxnum_v8bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_mul_f32_e32 v8, 1.0, v8
+; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT:    v_max_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v6
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT:    v_max_f32_e32 v7, v9, v7
+; GFX8-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT:    v_max_f32_e32 v6, v9, v6
+; GFX8-NEXT:    v_max_f32_e32 v1, v1, v5
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-NEXT:    v_min_f32_e32 v2, v3, v2
-; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_max_f32_e32 v5, v9, v5
+; GFX8-NEXT:    v_max_f32_e32 v0, v0, v4
 ; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX8-NEXT:    v_perm_b32 v0, v0, v5, s4
+; GFX8-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX8-NEXT:    v_perm_b32 v2, v2, v7, s4
+; GFX8-NEXT:    v_perm_b32 v3, v3, v8, s4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minnum_v2bf16:
+; GFX9-LABEL: v_maxnum_v8bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
+; GFX9-NEXT:    v_max_f32_e32 v8, v8, v8
+; GFX9-NEXT:    v_max_f32_e32 v9, v9, v9
+; GFX9-NEXT:    v_max_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
+; GFX9-NEXT:    v_max_f32_e32 v9, v9, v9
+; GFX9-NEXT:    v_max_f32_e32 v10, v10, v10
+; GFX9-NEXT:    v_max_f32_e32 v9, v10, v9
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v6
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
+; GFX9-NEXT:    v_max_f32_e32 v10, v10, v10
+; GFX9-NEXT:    v_max_f32_e32 v11, v11, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT:    v_max_f32_e32 v10, v11, v10
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v4
+; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v0
+; GFX9-NEXT:    v_max_f32_e32 v5, v5, v5
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_max_f32_e32 v4, v4, v4
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_min_f32_e32 v2, v3, v2
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_max_f32_e32 v11, v11, v11
+; GFX9-NEXT:    v_max_f32_e32 v12, v12, v12
+; GFX9-NEXT:    v_max_f32_e32 v7, v7, v7
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v9
+; GFX9-NEXT:    v_max_f32_e32 v11, v12, v11
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v5
 ; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v8
+; GFX9-NEXT:    v_perm_b32 v0, v0, v11, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v10, s4
+; GFX9-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_minnum_v2bf16:
+; GFX10-LABEL: v_maxnum_v8bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v2
+; GFX10-NEXT:    v_max_f32_e32 v8, v8, v8
+; GFX10-NEXT:    v_max_f32_e32 v9, v9, v9
+; GFX10-NEXT:    v_max_f32_e32 v10, v10, v10
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_max_f32_e32 v8, v9, v8
+; GFX10-NEXT:    v_max_f32_e32 v9, v11, v11
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_max_f32_e32 v9, v9, v10
+; GFX10-NEXT:    v_max_f32_e32 v10, v11, v11
+; GFX10-NEXT:    v_max_f32_e32 v11, v12, v12
+; GFX10-NEXT:    v_max_f32_e32 v12, v13, v13
+; GFX10-NEXT:    v_max_f32_e32 v13, v14, v14
+; GFX10-NEXT:    v_max_f32_e32 v7, v7, v7
 ; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT:    v_max_f32_e32 v5, v5, v5
 ; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT:    v_max_f32_e32 v4, v4, v4
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-NEXT:    v_min_f32_e32 v2, v3, v2
-; GFX10-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX10-NEXT:    v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT:    v_max_f32_e32 v10, v11, v10
+; GFX10-NEXT:    v_max_f32_e32 v11, v13, v12
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX10-NEXT:    v_max_f32_e32 v1, v1, v5
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v9
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v8
+; GFX10-NEXT:    v_perm_b32 v0, v0, v11, 0x3020706
+; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_perm_b32 v2, v2, v10, 0x3020706
+; GFX10-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_minnum_v2bf16:
+; GFX11-LABEL: v_maxnum_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2
-; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v8, v8, v8
+; GFX11-NEXT:    v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v10, v10, v10
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f32_e32 v2, v3, v2
-; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX11-NEXT:    v_dual_max_f32 v7, v7, v7 :: v_dual_max_f32 v8, v9, v8
+; GFX11-NEXT:    v_max_f32_e32 v9, v11, v10
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_max_f32 v11, v11, v11 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX11-NEXT:    v_max_f32_e32 v10, v12, v12
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_max_f32_e32 v10, v11, v10
+; GFX11-NEXT:    v_dual_max_f32 v13, v13, v13 :: v_dual_and_b32 v12, 0xffff0000, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v12, v12, v12
+; GFX11-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX11-NEXT:    v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v3, v3, v3
+; GFX11-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_max_f32_e32 v11, v13, v12
+; GFX11-NEXT:    v_dual_max_f32 v0, v0, v4 :: v_dual_max_f32 v3, v3, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_max_f32_e32 v1, v1, v5
+; GFX11-NEXT:    v_dual_max_f32 v5, v6, v6 :: v_dual_and_b32 v4, 0xffff0000, v9
+; GFX11-NEXT:    v_perm_b32 v0, v0, v11, 0x3020706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_max_f32 v2, v2, v5 :: v_dual_and_b32 v5, 0xffff0000, v8
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v10, 0x3020706
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %op = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
-  ret <2 x bfloat> %op
+  %op = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
+  ret <8 x bfloat> %op
 }
 
-declare bfloat @llvm.maxnum.bf16(bfloat, bfloat)
-declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
-
-define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_maxnum_bf16:
+define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v16bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT:    v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT:    v_max_f32_e32 v14, v14, v30
+; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT:    v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT:    v_max_f32_e32 v13, v13, v29
+; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT:    v_max_f32_e32 v12, v12, v28
+; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT:    v_max_f32_e32 v11, v11, v27
+; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT:    v_max_f32_e32 v10, v10, v26
+; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT:    v_max_f32_e32 v9, v9, v25
+; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v24
+; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT:    v_max_f32_e32 v8, v8, v24
+; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT:    v_max_f32_e32 v7, v7, v23
+; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT:    v_max_f32_e32 v6, v6, v22
+; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT:    v_max_f32_e32 v5, v5, v21
+; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT:    v_max_f32_e32 v4, v4, v20
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v17
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
 ; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_max_f32_e32 v3, v3, v19
+; GCN-NEXT:    v_max_f32_e32 v2, v2, v18
+; GCN-NEXT:    v_max_f32_e32 v1, v1, v17
+; GCN-NEXT:    v_max_f32_e32 v0, v0, v16
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v20
+; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT:    v_max_f32_e32 v15, v15, v16
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: v_maxnum_bf16:
+; GFX7-LABEL: v_maxnum_v16bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT:    v_max_f32_e32 v6, v6, v22
+; GFX7-NEXT:    buffer_load_dword v22, off, s[0:3], s32
+; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT:    v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT:    v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT:    v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT:    v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_max_f32_e32 v14, v14, v30
+; GFX7-NEXT:    v_max_f32_e32 v13, v13, v29
+; GFX7-NEXT:    v_max_f32_e32 v12, v12, v28
+; GFX7-NEXT:    v_max_f32_e32 v11, v11, v27
+; GFX7-NEXT:    v_max_f32_e32 v10, v10, v26
+; GFX7-NEXT:    v_max_f32_e32 v9, v9, v25
+; GFX7-NEXT:    v_max_f32_e32 v8, v8, v24
+; GFX7-NEXT:    v_max_f32_e32 v7, v7, v23
+; GFX7-NEXT:    v_max_f32_e32 v5, v5, v21
+; GFX7-NEXT:    v_max_f32_e32 v4, v4, v20
+; GFX7-NEXT:    v_max_f32_e32 v3, v3, v19
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v18
+; GFX7-NEXT:    v_max_f32_e32 v1, v1, v17
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v16
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT:    v_max_f32_e32 v15, v15, v22
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_maxnum_bf16:
+; GFX8-LABEL: v_maxnum_v16bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_mul_f32_e32 v16, 1.0, v16
+; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GFX8-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT:    v_max_f32_e32 v16, v17, v16
+; GFX8-NEXT:    v_max_f32_e32 v7, v7, v15
+; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v14
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT:    v_mul_f32_e32 v14, 1.0, v14
+; GFX8-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT:    v_max_f32_e32 v15, v17, v15
+; GFX8-NEXT:    v_max_f32_e32 v6, v6, v14
+; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v13
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_mul_f32_e32 v14, 1.0, v14
+; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT:    v_mul_f32_e32 v13, 1.0, v13
+; GFX8-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT:    v_max_f32_e32 v14, v17, v14
+; GFX8-NEXT:    v_max_f32_e32 v5, v5, v13
+; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v12
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_mul_f32_e32 v13, 1.0, v13
+; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT:    v_mul_f32_e32 v12, 1.0, v12
+; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT:    v_max_f32_e32 v13, v17, v13
+; GFX8-NEXT:    v_max_f32_e32 v4, v4, v12
+; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v11
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_mul_f32_e32 v12, 1.0, v12
+; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT:    v_mul_f32_e32 v11, 1.0, v11
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT:    v_max_f32_e32 v12, v17, v12
+; GFX8-NEXT:    v_max_f32_e32 v3, v3, v11
+; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v10
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_mul_f32_e32 v11, 1.0, v11
+; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT:    v_mul_f32_e32 v10, 1.0, v10
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT:    v_max_f32_e32 v11, v17, v11
+; GFX8-NEXT:    v_max_f32_e32 v2, v2, v10
+; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v9
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mul_f32_e32 v10, 1.0, v10
+; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT:    v_max_f32_e32 v10, v17, v10
+; GFX8-NEXT:    v_max_f32_e32 v1, v1, v9
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v8
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT:    v_mul_f32_e32 v8, 1.0, v8
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_max_f32_e32 v9, v17, v9
+; GFX8-NEXT:    v_max_f32_e32 v0, v0, v8
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v9, s4
+; GFX8-NEXT:    v_perm_b32 v1, v1, v10, s4
+; GFX8-NEXT:    v_perm_b32 v2, v2, v11, s4
+; GFX8-NEXT:    v_perm_b32 v3, v3, v12, s4
+; GFX8-NEXT:    v_perm_b32 v4, v4, v13, s4
+; GFX8-NEXT:    v_perm_b32 v5, v5, v14, s4
+; GFX8-NEXT:    v_perm_b32 v6, v6, v15, s4
+; GFX8-NEXT:    v_perm_b32 v7, v7, v16, s4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maxnum_bf16:
+; GFX9-LABEL: v_maxnum_v16bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
+; GFX9-NEXT:    v_max_f32_e32 v16, v16, v16
+; GFX9-NEXT:    v_max_f32_e32 v17, v17, v17
+; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v14
+; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
+; GFX9-NEXT:    v_max_f32_e32 v17, v17, v17
+; GFX9-NEXT:    v_max_f32_e32 v18, v18, v18
+; GFX9-NEXT:    v_max_f32_e32 v17, v18, v17
+; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v13
+; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v5
+; GFX9-NEXT:    v_max_f32_e32 v18, v18, v18
+; GFX9-NEXT:    v_max_f32_e32 v19, v19, v19
+; GFX9-NEXT:    v_max_f32_e32 v18, v19, v18
+; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v12
+; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v4
+; GFX9-NEXT:    v_max_f32_e32 v19, v19, v19
+; GFX9-NEXT:    v_max_f32_e32 v20, v20, v20
+; GFX9-NEXT:    v_max_f32_e32 v19, v20, v19
+; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v11
+; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v3
+; GFX9-NEXT:    v_max_f32_e32 v20, v20, v20
+; GFX9-NEXT:    v_max_f32_e32 v21, v21, v21
+; GFX9-NEXT:    v_max_f32_e32 v20, v21, v20
+; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v10
+; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v2
+; GFX9-NEXT:    v_max_f32_e32 v21, v21, v21
+; GFX9-NEXT:    v_max_f32_e32 v22, v22, v22
+; GFX9-NEXT:    v_max_f32_e32 v21, v22, v21
+; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v9
+; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v1
+; GFX9-NEXT:    v_max_f32_e32 v22, v22, v22
+; GFX9-NEXT:    v_max_f32_e32 v23, v23, v23
+; GFX9-NEXT:    v_max_f32_e32 v22, v23, v22
+; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v8
+; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_max_f32_e32 v23, v23, v23
+; GFX9-NEXT:    v_max_f32_e32 v24, v24, v24
+; GFX9-NEXT:    v_max_f32_e32 v15, v15, v15
+; GFX9-NEXT:    v_max_f32_e32 v7, v7, v7
+; GFX9-NEXT:    v_max_f32_e32 v14, v14, v14
+; GFX9-NEXT:    v_max_f32_e32 v6, v6, v6
+; GFX9-NEXT:    v_max_f32_e32 v13, v13, v13
+; GFX9-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT:    v_max_f32_e32 v12, v12, v12
+; GFX9-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT:    v_max_f32_e32 v11, v11, v11
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT:    v_max_f32_e32 v10, v10, v10
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT:    v_max_f32_e32 v9, v9, v9
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT:    v_max_f32_e32 v8, v8, v8
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_max_f32_e32 v23, v24, v23
+; GFX9-NEXT:    v_max_f32_e32 v7, v7, v15
+; GFX9-NEXT:    v_max_f32_e32 v6, v6, v14
+; GFX9-NEXT:    v_max_f32_e32 v5, v5, v13
+; GFX9-NEXT:    v_max_f32_e32 v4, v4, v12
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v11
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v10
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v9
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v8
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_perm_b32 v0, v0, v23, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v22, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v21, s4
+; GFX9-NEXT:    v_perm_b32 v3, v3, v20, s4
+; GFX9-NEXT:    v_perm_b32 v4, v4, v19, s4
+; GFX9-NEXT:    v_perm_b32 v5, v5, v18, s4
+; GFX9-NEXT:    v_perm_b32 v6, v6, v17, s4
+; GFX9-NEXT:    v_perm_b32 v7, v7, v16, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_maxnum_bf16:
+; GFX10-LABEL: v_maxnum_v16bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
+; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
+; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v14
+; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v6
+; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
+; GFX10-NEXT:    v_max_f32_e32 v16, v16, v16
+; GFX10-NEXT:    v_max_f32_e32 v17, v17, v17
+; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
+; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
+; GFX10-NEXT:    v_max_f32_e32 v16, v17, v16
+; GFX10-NEXT:    v_max_f32_e32 v17, v18, v18
+; GFX10-NEXT:    v_max_f32_e32 v18, v19, v19
+; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v13
+; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
+; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v0
+; GFX10-NEXT:    v_max_f32_e32 v17, v18, v17
+; GFX10-NEXT:    v_max_f32_e32 v18, v19, v19
+; GFX10-NEXT:    v_max_f32_e32 v19, v20, v20
+; GFX10-NEXT:    v_max_f32_e32 v20, v21, v21
+; GFX10-NEXT:    v_max_f32_e32 v21, v22, v22
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT:    v_max_f32_e32 v18, v19, v18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_max_f32_e32 v19, v21, v20
+; GFX10-NEXT:    v_max_f32_e32 v20, v22, v22
+; GFX10-NEXT:    v_max_f32_e32 v21, v23, v23
+; GFX10-NEXT:    v_max_f32_e32 v22, v24, v24
+; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_max_f32_e32 v23, v23, v23
+; GFX10-NEXT:    v_max_f32_e32 v24, v24, v24
+; GFX10-NEXT:    v_max_f32_e32 v25, v25, v25
+; GFX10-NEXT:    v_max_f32_e32 v26, v26, v26
+; GFX10-NEXT:    v_max_f32_e32 v27, v27, v27
+; GFX10-NEXT:    v_max_f32_e32 v15, v15, v15
+; GFX10-NEXT:    v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT:    v_max_f32_e32 v14, v14, v14
+; GFX10-NEXT:    v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT:    v_max_f32_e32 v13, v13, v13
+; GFX10-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT:    v_max_f32_e32 v12, v12, v12
+; GFX10-NEXT:    v_max_f32_e32 v11, v11, v11
+; GFX10-NEXT:    v_max_f32_e32 v10, v10, v10
+; GFX10-NEXT:    v_max_f32_e32 v9, v9, v9
+; GFX10-NEXT:    v_max_f32_e32 v8, v8, v8
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT:    v_max_f32_e32 v20, v21, v20
+; GFX10-NEXT:    v_max_f32_e32 v21, v23, v22
+; GFX10-NEXT:    v_max_f32_e32 v22, v25, v24
+; GFX10-NEXT:    v_max_f32_e32 v23, v27, v26
+; GFX10-NEXT:    v_max_f32_e32 v7, v7, v15
+; GFX10-NEXT:    v_max_f32_e32 v6, v6, v14
+; GFX10-NEXT:    v_max_f32_e32 v5, v5, v13
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v8
+; GFX10-NEXT:    v_max_f32_e32 v1, v1, v9
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v10
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v11
+; GFX10-NEXT:    v_max_f32_e32 v4, v4, v12
+; GFX10-NEXT:    v_perm_b32 v0, v0, v23, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v1, v1, v22, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v2, v2, v21, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v3, v3, v20, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v4, v4, v19, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v5, v5, v18, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v6, v6, v17, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v7, v7, v16, 0x3020706
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_maxnum_bf16:
+; GFX11-LABEL: v_maxnum_v16bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
+; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_max_f32 v17, v17, v17 :: v_dual_and_b32 v18, 0xffff0000, v14
+; GFX11-NEXT:    v_dual_max_f32 v16, v16, v16 :: v_dual_and_b32 v19, 0xffff0000, v6
+; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v3
+; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_max_f32 v16, v17, v16 :: v_dual_and_b32 v25, 0xffff0000, v1
+; GFX11-NEXT:    v_dual_max_f32 v17, v18, v18 :: v_dual_max_f32 v18, v19, v19
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v13
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_max_f32 v25, v25, v25 :: v_dual_and_b32 v26, 0xffff0000, v8
+; GFX11-NEXT:    v_dual_max_f32 v17, v18, v17 :: v_dual_max_f32 v18, v19, v19
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_max_f32 v19, v20, v20 :: v_dual_max_f32 v20, v21, v21
+; GFX11-NEXT:    v_dual_max_f32 v21, v22, v22 :: v_dual_and_b32 v22, 0xffff0000, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT:    v_dual_max_f32 v18, v19, v18 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_max_f32 v19, v21, v20 :: v_dual_max_f32 v20, v22, v22
+; GFX11-NEXT:    v_dual_max_f32 v14, v14, v14 :: v_dual_max_f32 v21, v23, v23
+; GFX11-NEXT:    v_dual_max_f32 v22, v24, v24 :: v_dual_lshlrev_b32 v15, 16, v15
+; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_max_f32 v20, v21, v20 :: v_dual_max_f32 v15, v15, v15
+; GFX11-NEXT:    v_dual_max_f32 v26, v26, v26 :: v_dual_and_b32 v27, 0xffff0000, v0
+; GFX11-NEXT:    v_dual_max_f32 v7, v7, v7 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT:    v_dual_max_f32 v23, v23, v23 :: v_dual_max_f32 v24, v24, v24
+; GFX11-NEXT:    v_dual_max_f32 v27, v27, v27 :: v_dual_max_f32 v6, v6, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_max_f32 v13, v13, v13 :: v_dual_lshlrev_b32 v10, 16, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT:    v_dual_max_f32 v21, v23, v22 :: v_dual_max_f32 v22, v25, v24
+; GFX11-NEXT:    v_dual_max_f32 v23, v27, v26 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX11-NEXT:    v_dual_max_f32 v6, v6, v14 :: v_dual_max_f32 v5, v5, v5
+; GFX11-NEXT:    v_dual_max_f32 v10, v10, v10 :: v_dual_max_f32 v11, v11, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_dual_max_f32 v7, v7, v15 :: v_dual_lshlrev_b32 v4, 16, v4
+; GFX11-NEXT:    v_dual_max_f32 v12, v12, v12 :: v_dual_max_f32 v5, v5, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11-NEXT:    v_dual_max_f32 v9, v9, v9 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_max_f32 v8, v8, v8 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v1, v1, v1
+; GFX11-NEXT:    v_dual_max_f32 v0, v0, v8 :: v_dual_max_f32 v3, v3, v11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_max_f32_e32 v2, v2, v10
+; GFX11-NEXT:    v_dual_max_f32 v4, v4, v12 :: v_dual_max_f32 v1, v1, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v23, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v3, v3, v20, 0x3020706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v21, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v4, v4, v19, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v1, v1, v22, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v5, v5, v18, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v6, v6, v17, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x3020706
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %op = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b)
-  ret bfloat %op
+  %op = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
+  ret <16 x bfloat> %op
 }
 
-define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
-; GCN-LABEL: v_maxnum_v2bf16:
+define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v32bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:124
+; GCN-NEXT:    v_max_f32_e32 v31, v32, v31
+; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT:    v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:120
+; GCN-NEXT:    v_max_f32_e32 v30, v30, v32
+; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT:    v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:116
+; GCN-NEXT:    v_max_f32_e32 v29, v29, v32
+; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:112
+; GCN-NEXT:    v_max_f32_e32 v28, v28, v32
+; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:108
+; GCN-NEXT:    v_max_f32_e32 v27, v27, v32
+; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:104
+; GCN-NEXT:    v_max_f32_e32 v26, v26, v32
+; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:100
+; GCN-NEXT:    v_max_f32_e32 v25, v25, v32
+; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v24
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:96
+; GCN-NEXT:    v_max_f32_e32 v24, v24, v32
+; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:92
+; GCN-NEXT:    v_max_f32_e32 v23, v23, v32
+; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:88
+; GCN-NEXT:    v_max_f32_e32 v22, v22, v32
+; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:84
+; GCN-NEXT:    v_max_f32_e32 v21, v21, v32
+; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:80
+; GCN-NEXT:    v_max_f32_e32 v20, v20, v32
+; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:76
+; GCN-NEXT:    v_max_f32_e32 v19, v19, v32
+; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:72
+; GCN-NEXT:    v_max_f32_e32 v18, v18, v32
+; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:68
+; GCN-NEXT:    v_max_f32_e32 v17, v17, v32
+; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:64
+; GCN-NEXT:    v_max_f32_e32 v16, v16, v32
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:60
+; GCN-NEXT:    v_max_f32_e32 v15, v15, v32
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:56
+; GCN-NEXT:    v_max_f32_e32 v14, v14, v32
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:52
+; GCN-NEXT:    v_max_f32_e32 v13, v13, v32
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:48
+; GCN-NEXT:    v_max_f32_e32 v12, v12, v32
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:44
+; GCN-NEXT:    v_max_f32_e32 v11, v11, v32
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:40
+; GCN-NEXT:    v_max_f32_e32 v10, v10, v32
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:36
+; GCN-NEXT:    v_max_f32_e32 v9, v9, v32
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:32
+; GCN-NEXT:    v_max_f32_e32 v8, v8, v32
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:28
+; GCN-NEXT:    v_max_f32_e32 v7, v7, v32
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:24
+; GCN-NEXT:    v_max_f32_e32 v6, v6, v32
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:20
+; GCN-NEXT:    v_max_f32_e32 v5, v5, v32
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:16
+; GCN-NEXT:    v_max_f32_e32 v4, v4, v32
 ; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
+; GCN-NEXT:    v_max_f32_e32 v3, v3, v32
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT:    v_max_f32_e32 v1, v1, v3
-; GCN-NEXT:    v_max_f32_e32 v0, v0, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GCN-NEXT:    v_max_f32_e32 v2, v2, v32
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4
+; GCN-NEXT:    v_max_f32_e32 v1, v1, v32
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_max_f32_e32 v0, v0, v32
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: v_maxnum_v2bf16:
+; GFX7-LABEL: v_maxnum_v32bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT:    v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT:    v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT:    v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT:    v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT:    v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v31, v32, v31
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v30, v30, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:120
+; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v29, v29, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v28, v28, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:112
+; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v27, v27, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v26, v26, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:104
+; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v25, v25, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v24, v24, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:96
+; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v23, v23, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v22, v22, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
+; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v21, v21, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v20, v20, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:80
+; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v19, v19, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v18, v18, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
+; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v17, v17, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v16, v16, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v15, v15, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v14, v14, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v13, v13, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v12, v12, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:48
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v11, v11, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v10, v10, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:40
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v9, v9, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v8, v8, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:32
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v7, v7, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v6, v6, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v5, v5, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v4, v4, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:16
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v3, v3, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v1, v1, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v32
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_maxnum_v2bf16:
+; GFX8-LABEL: v_maxnum_v32bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT:    v_and_b32_e32 v31, 0xffff0000, v30
+; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_mul_f32_e32 v31, 1.0, v31
+; GFX8-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT:    v_mul_f32_e32 v30, 1.0, v30
+; GFX8-NEXT:    v_mul_f32_e32 v14, 1.0, v14
+; GFX8-NEXT:    v_max_f32_e32 v31, v32, v31
+; GFX8-NEXT:    v_max_f32_e32 v14, v14, v30
+; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v29
+; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_mul_f32_e32 v30, 1.0, v30
+; GFX8-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT:    v_mul_f32_e32 v29, 1.0, v29
+; GFX8-NEXT:    v_mul_f32_e32 v13, 1.0, v13
+; GFX8-NEXT:    v_max_f32_e32 v30, v32, v30
+; GFX8-NEXT:    v_max_f32_e32 v13, v13, v29
+; GFX8-NEXT:    v_and_b32_e32 v29, 0xffff0000, v28
+; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_mul_f32_e32 v29, 1.0, v29
+; GFX8-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT:    v_mul_f32_e32 v28, 1.0, v28
+; GFX8-NEXT:    v_mul_f32_e32 v12, 1.0, v12
+; GFX8-NEXT:    v_max_f32_e32 v29, v32, v29
+; GFX8-NEXT:    v_max_f32_e32 v12, v12, v28
+; GFX8-NEXT:    v_and_b32_e32 v28, 0xffff0000, v27
+; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v11
+; GFX8-NEXT:    v_mul_f32_e32 v28, 1.0, v28
+; GFX8-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT:    v_max_f32_e32 v28, v32, v28
+; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_mul_f32_e32 v27, 1.0, v27
+; GFX8-NEXT:    v_mul_f32_e32 v11, 1.0, v11
+; GFX8-NEXT:    v_max_f32_e32 v11, v11, v27
+; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT:    v_mul_f32_e32 v27, 1.0, v27
+; GFX8-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v11, v11, v28, s4
+; GFX8-NEXT:    v_perm_b32 v12, v12, v29, s4
+; GFX8-NEXT:    v_perm_b32 v13, v13, v30, s4
+; GFX8-NEXT:    v_perm_b32 v14, v14, v31, s4
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v32
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
+; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT:    v_max_f32_e32 v27, v27, v33
+; GFX8-NEXT:    v_max_f32_e32 v15, v15, v32
+; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v26
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT:    v_mul_f32_e32 v26, 1.0, v26
+; GFX8-NEXT:    v_mul_f32_e32 v10, 1.0, v10
+; GFX8-NEXT:    v_max_f32_e32 v32, v33, v32
+; GFX8-NEXT:    v_max_f32_e32 v10, v10, v26
+; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v25
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_mul_f32_e32 v26, 1.0, v26
+; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT:    v_mul_f32_e32 v25, 1.0, v25
+; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT:    v_max_f32_e32 v26, v33, v26
+; GFX8-NEXT:    v_max_f32_e32 v9, v9, v25
+; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v24
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_mul_f32_e32 v25, 1.0, v25
+; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT:    v_mul_f32_e32 v24, 1.0, v24
+; GFX8-NEXT:    v_mul_f32_e32 v8, 1.0, v8
+; GFX8-NEXT:    v_max_f32_e32 v25, v33, v25
+; GFX8-NEXT:    v_max_f32_e32 v8, v8, v24
+; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v23
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_mul_f32_e32 v24, 1.0, v24
+; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT:    v_mul_f32_e32 v23, 1.0, v23
+; GFX8-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT:    v_max_f32_e32 v24, v33, v24
+; GFX8-NEXT:    v_max_f32_e32 v7, v7, v23
+; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v22
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_mul_f32_e32 v23, 1.0, v23
+; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT:    v_mul_f32_e32 v22, 1.0, v22
+; GFX8-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT:    v_max_f32_e32 v23, v33, v23
+; GFX8-NEXT:    v_max_f32_e32 v6, v6, v22
+; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v21
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_mul_f32_e32 v22, 1.0, v22
+; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT:    v_mul_f32_e32 v21, 1.0, v21
+; GFX8-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT:    v_max_f32_e32 v22, v33, v22
+; GFX8-NEXT:    v_max_f32_e32 v5, v5, v21
+; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v20
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_mul_f32_e32 v21, 1.0, v21
+; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT:    v_mul_f32_e32 v20, 1.0, v20
+; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT:    v_max_f32_e32 v21, v33, v21
+; GFX8-NEXT:    v_max_f32_e32 v4, v4, v20
+; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v19
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_mul_f32_e32 v20, 1.0, v20
+; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT:    v_mul_f32_e32 v19, 1.0, v19
 ; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT:    v_max_f32_e32 v20, v33, v20
+; GFX8-NEXT:    v_max_f32_e32 v3, v3, v19
+; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v18
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_mul_f32_e32 v19, 1.0, v19
+; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT:    v_mul_f32_e32 v18, 1.0, v18
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT:    v_max_f32_e32 v19, v33, v19
+; GFX8-NEXT:    v_max_f32_e32 v2, v2, v18
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v17
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mul_f32_e32 v18, 1.0, v18
+; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT:    v_max_f32_e32 v18, v33, v18
+; GFX8-NEXT:    v_max_f32_e32 v1, v1, v17
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v16
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT:    v_mul_f32_e32 v16, 1.0, v16
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-NEXT:    v_max_f32_e32 v2, v3, v2
-; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX8-NEXT:    v_max_f32_e32 v17, v33, v17
+; GFX8-NEXT:    v_max_f32_e32 v0, v0, v16
+; GFX8-NEXT:    v_perm_b32 v0, v0, v17, s4
+; GFX8-NEXT:    v_perm_b32 v1, v1, v18, s4
+; GFX8-NEXT:    v_perm_b32 v2, v2, v19, s4
+; GFX8-NEXT:    v_perm_b32 v3, v3, v20, s4
+; GFX8-NEXT:    v_perm_b32 v4, v4, v21, s4
+; GFX8-NEXT:    v_perm_b32 v5, v5, v22, s4
+; GFX8-NEXT:    v_perm_b32 v6, v6, v23, s4
+; GFX8-NEXT:    v_perm_b32 v7, v7, v24, s4
+; GFX8-NEXT:    v_perm_b32 v8, v8, v25, s4
+; GFX8-NEXT:    v_perm_b32 v9, v9, v26, s4
+; GFX8-NEXT:    v_perm_b32 v10, v10, v32, s4
+; GFX8-NEXT:    v_perm_b32 v15, v15, v27, s4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maxnum_v2bf16:
+; GFX9-LABEL: v_maxnum_v32bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s32
+; GFX9-NEXT:    v_and_b32_e32 v31, 0xffff0000, v30
+; GFX9-NEXT:    v_and_b32_e32 v32, 0xffff0000, v14
+; GFX9-NEXT:    v_and_b32_e32 v33, 0xffff0000, v29
+; GFX9-NEXT:    v_and_b32_e32 v34, 0xffff0000, v13
+; GFX9-NEXT:    v_and_b32_e32 v36, 0xffff0000, v28
+; GFX9-NEXT:    v_and_b32_e32 v37, 0xffff0000, v12
+; GFX9-NEXT:    v_and_b32_e32 v50, 0xffff0000, v25
+; GFX9-NEXT:    v_and_b32_e32 v51, 0xffff0000, v9
+; GFX9-NEXT:    v_max_f32_e32 v31, v31, v31
+; GFX9-NEXT:    v_max_f32_e32 v32, v32, v32
+; GFX9-NEXT:    v_max_f32_e32 v33, v33, v33
+; GFX9-NEXT:    v_max_f32_e32 v34, v34, v34
+; GFX9-NEXT:    v_max_f32_e32 v36, v36, v36
+; GFX9-NEXT:    v_max_f32_e32 v37, v37, v37
+; GFX9-NEXT:    v_max_f32_e32 v50, v50, v50
+; GFX9-NEXT:    v_max_f32_e32 v51, v51, v51
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
+; GFX9-NEXT:    v_and_b32_e32 v39, 0xffff0000, v11
+; GFX9-NEXT:    v_and_b32_e32 v52, 0xffff0000, v24
+; GFX9-NEXT:    v_and_b32_e32 v53, 0xffff0000, v8
+; GFX9-NEXT:    v_and_b32_e32 v43, 0xffff0000, v21
+; GFX9-NEXT:    v_max_f32_e32 v31, v32, v31
+; GFX9-NEXT:    v_max_f32_e32 v32, v34, v33
+; GFX9-NEXT:    v_max_f32_e32 v33, v37, v36
+; GFX9-NEXT:    v_max_f32_e32 v37, v51, v50
+; GFX9-NEXT:    v_and_b32_e32 v51, 0xffff0000, v5
+; GFX9-NEXT:    v_max_f32_e32 v38, v38, v38
+; GFX9-NEXT:    v_max_f32_e32 v39, v39, v39
+; GFX9-NEXT:    v_max_f32_e32 v52, v52, v52
+; GFX9-NEXT:    v_max_f32_e32 v53, v53, v53
+; GFX9-NEXT:    v_max_f32_e32 v50, v43, v43
+; GFX9-NEXT:    v_max_f32_e32 v51, v51, v51
+; GFX9-NEXT:    v_max_f32_e32 v34, v39, v38
+; GFX9-NEXT:    v_max_f32_e32 v38, v53, v52
+; GFX9-NEXT:    v_max_f32_e32 v50, v51, v50
+; GFX9-NEXT:    v_and_b32_e32 v51, 0xffff0000, v20
+; GFX9-NEXT:    v_and_b32_e32 v52, 0xffff0000, v4
+; GFX9-NEXT:    v_max_f32_e32 v51, v51, v51
+; GFX9-NEXT:    v_max_f32_e32 v52, v52, v52
+; GFX9-NEXT:    v_and_b32_e32 v54, 0xffff0000, v23
+; GFX9-NEXT:    v_and_b32_e32 v55, 0xffff0000, v7
+; GFX9-NEXT:    v_max_f32_e32 v51, v52, v51
+; GFX9-NEXT:    v_and_b32_e32 v52, 0xffff0000, v19
+; GFX9-NEXT:    v_and_b32_e32 v53, 0xffff0000, v3
+; GFX9-NEXT:    v_max_f32_e32 v54, v54, v54
+; GFX9-NEXT:    v_max_f32_e32 v55, v55, v55
+; GFX9-NEXT:    v_max_f32_e32 v52, v52, v52
+; GFX9-NEXT:    v_max_f32_e32 v53, v53, v53
+; GFX9-NEXT:    v_max_f32_e32 v39, v55, v54
+; GFX9-NEXT:    v_max_f32_e32 v52, v53, v52
+; GFX9-NEXT:    v_and_b32_e32 v53, 0xffff0000, v18
+; GFX9-NEXT:    v_and_b32_e32 v54, 0xffff0000, v2
+; GFX9-NEXT:    v_max_f32_e32 v53, v53, v53
+; GFX9-NEXT:    v_max_f32_e32 v54, v54, v54
+; GFX9-NEXT:    v_and_b32_e32 v48, 0xffff0000, v26
+; GFX9-NEXT:    v_and_b32_e32 v49, 0xffff0000, v10
+; GFX9-NEXT:    v_and_b32_e32 v40, 0xffff0000, v22
+; GFX9-NEXT:    v_and_b32_e32 v41, 0xffff0000, v6
+; GFX9-NEXT:    v_max_f32_e32 v53, v54, v53
+; GFX9-NEXT:    v_and_b32_e32 v54, 0xffff0000, v17
+; GFX9-NEXT:    v_and_b32_e32 v55, 0xffff0000, v1
+; GFX9-NEXT:    v_max_f32_e32 v48, v48, v48
+; GFX9-NEXT:    v_max_f32_e32 v49, v49, v49
+; GFX9-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX9-NEXT:    v_max_f32_e32 v41, v41, v41
+; GFX9-NEXT:    v_max_f32_e32 v54, v54, v54
+; GFX9-NEXT:    v_max_f32_e32 v55, v55, v55
+; GFX9-NEXT:    v_and_b32_e32 v42, 0xffff0000, v15
+; GFX9-NEXT:    v_max_f32_e32 v36, v49, v48
+; GFX9-NEXT:    v_max_f32_e32 v48, v41, v40
+; GFX9-NEXT:    v_max_f32_e32 v54, v55, v54
+; GFX9-NEXT:    v_and_b32_e32 v55, 0xffff0000, v16
+; GFX9-NEXT:    v_and_b32_e32 v40, 0xffff0000, v0
+; GFX9-NEXT:    v_max_f32_e32 v42, v42, v42
+; GFX9-NEXT:    v_max_f32_e32 v55, v55, v55
+; GFX9-NEXT:    v_max_f32_e32 v40, v40, v40
+; GFX9-NEXT:    v_max_f32_e32 v55, v40, v55
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    v_and_b32_e32 v49, 0xffff0000, v35
+; GFX9-NEXT:    v_max_f32_e32 v49, v49, v49
+; GFX9-NEXT:    v_max_f32_e32 v49, v42, v49
+; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_lshlrev_b32_e32 v35, 16, v35
+; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT:    v_max_f32_e32 v35, v35, v35
+; GFX9-NEXT:    v_max_f32_e32 v15, v15, v15
+; GFX9-NEXT:    v_max_f32_e32 v30, v30, v30
+; GFX9-NEXT:    v_max_f32_e32 v14, v14, v14
+; GFX9-NEXT:    v_max_f32_e32 v29, v29, v29
+; GFX9-NEXT:    v_max_f32_e32 v13, v13, v13
+; GFX9-NEXT:    v_max_f32_e32 v28, v28, v28
+; GFX9-NEXT:    v_max_f32_e32 v12, v12, v12
+; GFX9-NEXT:    v_max_f32_e32 v27, v27, v27
+; GFX9-NEXT:    v_max_f32_e32 v11, v11, v11
+; GFX9-NEXT:    v_max_f32_e32 v26, v26, v26
+; GFX9-NEXT:    v_max_f32_e32 v10, v10, v10
+; GFX9-NEXT:    v_max_f32_e32 v25, v25, v25
+; GFX9-NEXT:    v_max_f32_e32 v9, v9, v9
+; GFX9-NEXT:    v_max_f32_e32 v24, v24, v24
+; GFX9-NEXT:    v_max_f32_e32 v8, v8, v8
+; GFX9-NEXT:    v_max_f32_e32 v23, v23, v23
+; GFX9-NEXT:    v_max_f32_e32 v7, v7, v7
+; GFX9-NEXT:    v_max_f32_e32 v22, v22, v22
+; GFX9-NEXT:    v_max_f32_e32 v6, v6, v6
+; GFX9-NEXT:    v_max_f32_e32 v21, v21, v21
+; GFX9-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT:    v_max_f32_e32 v20, v20, v20
+; GFX9-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT:    v_max_f32_e32 v19, v19, v19
 ; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT:    v_max_f32_e32 v18, v18, v18
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT:    v_max_f32_e32 v17, v17, v17
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT:    v_max_f32_e32 v16, v16, v16
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_max_f32_e32 v2, v3, v2
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_max_f32_e32 v15, v15, v35
+; GFX9-NEXT:    v_max_f32_e32 v14, v14, v30
+; GFX9-NEXT:    v_max_f32_e32 v13, v13, v29
+; GFX9-NEXT:    v_max_f32_e32 v12, v12, v28
+; GFX9-NEXT:    v_max_f32_e32 v11, v11, v27
+; GFX9-NEXT:    v_max_f32_e32 v10, v10, v26
+; GFX9-NEXT:    v_max_f32_e32 v9, v9, v25
+; GFX9-NEXT:    v_max_f32_e32 v8, v8, v24
+; GFX9-NEXT:    v_max_f32_e32 v7, v7, v23
+; GFX9-NEXT:    v_max_f32_e32 v6, v6, v22
+; GFX9-NEXT:    v_max_f32_e32 v5, v5, v21
+; GFX9-NEXT:    v_max_f32_e32 v4, v4, v20
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v19
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v18
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v17
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v16
 ; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v55, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v54, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v53, s4
+; GFX9-NEXT:    v_perm_b32 v3, v3, v52, s4
+; GFX9-NEXT:    v_perm_b32 v4, v4, v51, s4
+; GFX9-NEXT:    v_perm_b32 v5, v5, v50, s4
+; GFX9-NEXT:    v_perm_b32 v6, v6, v48, s4
+; GFX9-NEXT:    v_perm_b32 v7, v7, v39, s4
+; GFX9-NEXT:    v_perm_b32 v8, v8, v38, s4
+; GFX9-NEXT:    v_perm_b32 v9, v9, v37, s4
+; GFX9-NEXT:    v_perm_b32 v10, v10, v36, s4
+; GFX9-NEXT:    v_perm_b32 v11, v11, v34, s4
+; GFX9-NEXT:    v_perm_b32 v12, v12, v33, s4
+; GFX9-NEXT:    v_perm_b32 v13, v13, v32, s4
+; GFX9-NEXT:    v_perm_b32 v14, v14, v31, s4
+; GFX9-NEXT:    v_perm_b32 v15, v15, v49, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_maxnum_v2bf16:
+; GFX10-LABEL: v_maxnum_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT:    v_and_b32_e32 v53, 0xffff0000, v24
+; GFX10-NEXT:    v_and_b32_e32 v54, 0xffff0000, v8
+; GFX10-NEXT:    v_and_b32_e32 v55, 0xffff0000, v23
+; GFX10-NEXT:    v_and_b32_e32 v64, 0xffff0000, v7
+; GFX10-NEXT:    v_and_b32_e32 v65, 0xffff0000, v22
+; GFX10-NEXT:    v_and_b32_e32 v66, 0xffff0000, v6
+; GFX10-NEXT:    v_and_b32_e32 v67, 0xffff0000, v21
+; GFX10-NEXT:    v_and_b32_e32 v68, 0xffff0000, v5
+; GFX10-NEXT:    v_max_f32_e32 v53, v53, v53
+; GFX10-NEXT:    v_max_f32_e32 v54, v54, v54
+; GFX10-NEXT:    v_max_f32_e32 v55, v55, v55
+; GFX10-NEXT:    v_max_f32_e32 v64, v64, v64
+; GFX10-NEXT:    v_max_f32_e32 v65, v65, v65
+; GFX10-NEXT:    v_max_f32_e32 v66, v66, v66
+; GFX10-NEXT:    v_max_f32_e32 v67, v67, v67
+; GFX10-NEXT:    v_max_f32_e32 v68, v68, v68
+; GFX10-NEXT:    v_and_b32_e32 v32, 0xffff0000, v30
+; GFX10-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
+; GFX10-NEXT:    v_and_b32_e32 v35, 0xffff0000, v29
+; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v13
+; GFX10-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
+; GFX10-NEXT:    v_and_b32_e32 v38, 0xffff0000, v12
+; GFX10-NEXT:    v_and_b32_e32 v39, 0xffff0000, v27
+; GFX10-NEXT:    v_and_b32_e32 v48, 0xffff0000, v11
+; GFX10-NEXT:    v_and_b32_e32 v49, 0xffff0000, v26
+; GFX10-NEXT:    v_and_b32_e32 v50, 0xffff0000, v10
+; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v25
+; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v9
+; GFX10-NEXT:    v_max_f32_e32 v53, v54, v53
+; GFX10-NEXT:    v_and_b32_e32 v54, 0xffff0000, v17
+; GFX10-NEXT:    v_max_f32_e32 v55, v64, v55
+; GFX10-NEXT:    v_and_b32_e32 v64, 0xffff0000, v1
+; GFX10-NEXT:    v_max_f32_e32 v65, v66, v65
+; GFX10-NEXT:    v_and_b32_e32 v66, 0xffff0000, v16
+; GFX10-NEXT:    v_max_f32_e32 v67, v68, v67
+; GFX10-NEXT:    v_and_b32_e32 v68, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT:    v_max_f32_e32 v32, v32, v32
+; GFX10-NEXT:    v_max_f32_e32 v34, v34, v34
+; GFX10-NEXT:    v_max_f32_e32 v35, v35, v35
+; GFX10-NEXT:    v_max_f32_e32 v36, v36, v36
+; GFX10-NEXT:    v_max_f32_e32 v37, v37, v37
+; GFX10-NEXT:    v_max_f32_e32 v38, v38, v38
+; GFX10-NEXT:    v_max_f32_e32 v39, v39, v39
+; GFX10-NEXT:    v_max_f32_e32 v48, v48, v48
+; GFX10-NEXT:    v_max_f32_e32 v49, v49, v49
+; GFX10-NEXT:    v_max_f32_e32 v50, v50, v50
+; GFX10-NEXT:    v_max_f32_e32 v51, v51, v51
+; GFX10-NEXT:    v_max_f32_e32 v52, v52, v52
+; GFX10-NEXT:    v_max_f32_e32 v17, v17, v17
 ; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT:    v_max_f32_e32 v16, v16, v16
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-NEXT:    v_max_f32_e32 v2, v3, v2
-; GFX10-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX10-NEXT:    v_and_b32_e32 v33, 0xffff0000, v15
+; GFX10-NEXT:    v_max_f32_e32 v32, v34, v32
+; GFX10-NEXT:    v_and_b32_e32 v34, 0xffff0000, v20
+; GFX10-NEXT:    v_max_f32_e32 v35, v36, v35
+; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v4
+; GFX10-NEXT:    v_max_f32_e32 v37, v38, v37
+; GFX10-NEXT:    v_and_b32_e32 v38, 0xffff0000, v19
+; GFX10-NEXT:    v_max_f32_e32 v39, v48, v39
+; GFX10-NEXT:    v_and_b32_e32 v48, 0xffff0000, v3
+; GFX10-NEXT:    v_max_f32_e32 v49, v50, v49
+; GFX10-NEXT:    v_and_b32_e32 v50, 0xffff0000, v18
+; GFX10-NEXT:    v_max_f32_e32 v51, v52, v51
+; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v16
+; GFX10-NEXT:    v_max_f32_e32 v1, v1, v17
+; GFX10-NEXT:    v_max_f32_e32 v33, v33, v33
+; GFX10-NEXT:    v_max_f32_e32 v34, v34, v34
+; GFX10-NEXT:    v_max_f32_e32 v36, v36, v36
+; GFX10-NEXT:    v_max_f32_e32 v38, v38, v38
+; GFX10-NEXT:    v_max_f32_e32 v48, v48, v48
+; GFX10-NEXT:    v_max_f32_e32 v50, v50, v50
+; GFX10-NEXT:    v_max_f32_e32 v52, v52, v52
+; GFX10-NEXT:    v_max_f32_e32 v54, v54, v54
+; GFX10-NEXT:    v_max_f32_e32 v64, v64, v64
+; GFX10-NEXT:    v_max_f32_e32 v66, v66, v66
+; GFX10-NEXT:    v_max_f32_e32 v68, v68, v68
+; GFX10-NEXT:    v_max_f32_e32 v15, v15, v15
+; GFX10-NEXT:    v_max_f32_e32 v30, v30, v30
+; GFX10-NEXT:    v_max_f32_e32 v14, v14, v14
+; GFX10-NEXT:    v_max_f32_e32 v29, v29, v29
+; GFX10-NEXT:    v_max_f32_e32 v13, v13, v13
+; GFX10-NEXT:    v_max_f32_e32 v28, v28, v28
+; GFX10-NEXT:    v_max_f32_e32 v12, v12, v12
+; GFX10-NEXT:    v_max_f32_e32 v27, v27, v27
+; GFX10-NEXT:    v_max_f32_e32 v11, v11, v11
+; GFX10-NEXT:    v_max_f32_e32 v26, v26, v26
+; GFX10-NEXT:    v_max_f32_e32 v10, v10, v10
+; GFX10-NEXT:    v_max_f32_e32 v25, v25, v25
+; GFX10-NEXT:    v_max_f32_e32 v9, v9, v9
+; GFX10-NEXT:    v_max_f32_e32 v24, v24, v24
+; GFX10-NEXT:    v_max_f32_e32 v8, v8, v8
+; GFX10-NEXT:    v_max_f32_e32 v23, v23, v23
+; GFX10-NEXT:    v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT:    v_max_f32_e32 v22, v22, v22
+; GFX10-NEXT:    v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT:    v_max_f32_e32 v21, v21, v21
+; GFX10-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT:    v_max_f32_e32 v20, v20, v20
+; GFX10-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT:    v_max_f32_e32 v19, v19, v19
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT:    v_max_f32_e32 v18, v18, v18
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT:    v_max_f32_e32 v34, v36, v34
+; GFX10-NEXT:    v_max_f32_e32 v36, v48, v38
+; GFX10-NEXT:    v_max_f32_e32 v38, v52, v50
+; GFX10-NEXT:    v_max_f32_e32 v48, v64, v54
+; GFX10-NEXT:    v_max_f32_e32 v50, v68, v66
+; GFX10-NEXT:    v_max_f32_e32 v14, v14, v30
+; GFX10-NEXT:    v_max_f32_e32 v13, v13, v29
+; GFX10-NEXT:    v_max_f32_e32 v12, v12, v28
+; GFX10-NEXT:    v_max_f32_e32 v11, v11, v27
+; GFX10-NEXT:    v_max_f32_e32 v10, v10, v26
+; GFX10-NEXT:    v_max_f32_e32 v9, v9, v25
+; GFX10-NEXT:    v_max_f32_e32 v8, v8, v24
+; GFX10-NEXT:    v_max_f32_e32 v7, v7, v23
+; GFX10-NEXT:    v_max_f32_e32 v6, v6, v22
+; GFX10-NEXT:    v_max_f32_e32 v5, v5, v21
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v18
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v19
+; GFX10-NEXT:    v_max_f32_e32 v4, v4, v20
+; GFX10-NEXT:    v_perm_b32 v0, v0, v50, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v1, v1, v48, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v2, v2, v38, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v3, v3, v36, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v4, v4, v34, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v5, v5, v67, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v6, v6, v65, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v7, v7, v55, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v8, v8, v53, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v9, v9, v51, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v10, v10, v49, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v11, v11, v39, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v12, v12, v37, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v13, v13, v35, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v14, v14, v32, 0x3020706
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v31
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v31
+; GFX10-NEXT:    v_max_f32_e32 v16, v16, v16
+; GFX10-NEXT:    v_max_f32_e32 v17, v17, v17
+; GFX10-NEXT:    v_max_f32_e32 v16, v33, v16
+; GFX10-NEXT:    v_max_f32_e32 v15, v15, v17
+; GFX10-NEXT:    v_perm_b32 v15, v15, v16, 0x3020706
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_maxnum_v2bf16:
+; GFX11-LABEL: v_maxnum_v32bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff0000, v30
+; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff0000, v29
+; GFX11-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
+; GFX11-NEXT:    v_and_b32_e32 v38, 0xffff0000, v12
+; GFX11-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
+; GFX11-NEXT:    v_and_b32_e32 v39, 0xffff0000, v27
+; GFX11-NEXT:    v_and_b32_e32 v36, 0xffff0000, v13
+; GFX11-NEXT:    v_dual_max_f32 v33, v33, v33 :: v_dual_and_b32 v32, 0xffff0000, v15
+; GFX11-NEXT:    v_and_b32_e32 v49, 0xffff0000, v26
+; GFX11-NEXT:    v_and_b32_e32 v48, 0xffff0000, v11
+; GFX11-NEXT:    v_and_b32_e32 v51, 0xffff0000, v25
+; GFX11-NEXT:    v_and_b32_e32 v50, 0xffff0000, v10
+; GFX11-NEXT:    v_and_b32_e32 v54, 0xffff0000, v8
+; GFX11-NEXT:    v_and_b32_e32 v53, 0xffff0000, v24
+; GFX11-NEXT:    v_and_b32_e32 v55, 0xffff0000, v23
+; GFX11-NEXT:    v_and_b32_e32 v52, 0xffff0000, v9
+; GFX11-NEXT:    v_and_b32_e32 v65, 0xffff0000, v22
+; GFX11-NEXT:    v_and_b32_e32 v67, 0xffff0000, v21
+; GFX11-NEXT:    v_and_b32_e32 v66, 0xffff0000, v6
+; GFX11-NEXT:    v_and_b32_e32 v71, 0xffff0000, v19
+; GFX11-NEXT:    v_and_b32_e32 v68, 0xffff0000, v5
+; GFX11-NEXT:    v_and_b32_e32 v83, 0xffff0000, v17
+; GFX11-NEXT:    v_and_b32_e32 v86, 0xffff0000, v0
+; GFX11-NEXT:    v_and_b32_e32 v85, 0xffff0000, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT:    v_and_b32_e32 v84, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2
-; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f32_e32 v2, v3, v2
-; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX11-NEXT:    v_dual_max_f32 v35, v35, v35 :: v_dual_max_f32 v34, v34, v34
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT:    v_dual_max_f32 v38, v38, v38 :: v_dual_max_f32 v37, v37, v37
+; GFX11-NEXT:    v_dual_max_f32 v39, v39, v39 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT:    v_max_f32_e32 v36, v36, v36
+; GFX11-NEXT:    v_dual_max_f32 v65, v65, v65 :: v_dual_and_b32 v64, 0xffff0000, v7
+; GFX11-NEXT:    v_and_b32_e32 v70, 0xffff0000, v4
+; GFX11-NEXT:    v_and_b32_e32 v69, 0xffff0000, v20
+; GFX11-NEXT:    v_and_b32_e32 v81, 0xffff0000, v18
+; GFX11-NEXT:    v_dual_max_f32 v83, v83, v83 :: v_dual_and_b32 v82, 0xffff0000, v2
+; GFX11-NEXT:    v_dual_max_f32 v17, v17, v17 :: v_dual_lshlrev_b32 v18, 16, v18
+; GFX11-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-NEXT:    v_dual_max_f32 v49, v49, v49 :: v_dual_max_f32 v48, v48, v48
+; GFX11-NEXT:    v_dual_max_f32 v51, v51, v51 :: v_dual_max_f32 v50, v50, v50
+; GFX11-NEXT:    v_dual_max_f32 v54, v54, v54 :: v_dual_max_f32 v53, v53, v53
+; GFX11-NEXT:    v_dual_max_f32 v67, v67, v67 :: v_dual_max_f32 v66, v66, v66
+; GFX11-NEXT:    v_dual_max_f32 v25, v25, v25 :: v_dual_max_f32 v26, v26, v26
+; GFX11-NEXT:    v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v10, v10, v10
+; GFX11-NEXT:    v_dual_max_f32 v21, v21, v21 :: v_dual_max_f32 v22, v22, v22
+; GFX11-NEXT:    v_dual_max_f32 v5, v5, v5 :: v_dual_max_f32 v6, v6, v6
+; GFX11-NEXT:    v_dual_max_f32 v33, v34, v33 :: v_dual_max_f32 v16, v16, v16
+; GFX11-NEXT:    v_dual_max_f32 v34, v36, v35 :: v_dual_max_f32 v35, v38, v37
+; GFX11-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX11-NEXT:    v_dual_max_f32 v81, v81, v81 :: v_dual_and_b32 v80, 0xffff0000, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_dual_max_f32 v70, v70, v70 :: v_dual_max_f32 v69, v69, v69
+; GFX11-NEXT:    v_dual_max_f32 v36, v48, v39 :: v_dual_max_f32 v37, v50, v49
+; GFX11-NEXT:    v_max_f32_e32 v39, v54, v53
+; GFX11-NEXT:    v_dual_max_f32 v10, v10, v26 :: v_dual_max_f32 v1, v1, v17
+; GFX11-NEXT:    v_max_f32_e32 v6, v6, v22
+; GFX11-NEXT:    v_dual_max_f32 v32, v32, v32 :: v_dual_max_f32 v55, v55, v55
+; GFX11-NEXT:    v_max_f32_e32 v52, v52, v52
+; GFX11-NEXT:    v_dual_max_f32 v64, v64, v64 :: v_dual_max_f32 v71, v71, v71
+; GFX11-NEXT:    v_max_f32_e32 v68, v68, v68
+; GFX11-NEXT:    v_max_f32_e32 v80, v80, v80
+; GFX11-NEXT:    v_max_f32_e32 v82, v82, v82
+; GFX11-NEXT:    v_dual_max_f32 v86, v86, v86 :: v_dual_max_f32 v85, v85, v85
+; GFX11-NEXT:    v_dual_max_f32 v15, v15, v15 :: v_dual_max_f32 v84, v84, v84
+; GFX11-NEXT:    v_dual_max_f32 v29, v29, v29 :: v_dual_max_f32 v30, v30, v30
+; GFX11-NEXT:    v_dual_max_f32 v13, v13, v13 :: v_dual_max_f32 v14, v14, v14
+; GFX11-NEXT:    v_dual_max_f32 v27, v27, v27 :: v_dual_max_f32 v28, v28, v28
+; GFX11-NEXT:    v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v12, v12, v12
+; GFX11-NEXT:    v_dual_max_f32 v23, v23, v23 :: v_dual_max_f32 v24, v24, v24
+; GFX11-NEXT:    v_dual_max_f32 v7, v7, v7 :: v_dual_max_f32 v8, v8, v8
+; GFX11-NEXT:    v_dual_max_f32 v19, v19, v19 :: v_dual_max_f32 v20, v20, v20
+; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v4, v4, v4
+; GFX11-NEXT:    v_max_f32_e32 v18, v18, v18
+; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX11-NEXT:    v_dual_max_f32 v38, v52, v51 :: v_dual_max_f32 v53, v82, v81
+; GFX11-NEXT:    v_dual_max_f32 v48, v64, v55 :: v_dual_max_f32 v55, v86, v85
+; GFX11-NEXT:    v_dual_max_f32 v49, v66, v65 :: v_dual_max_f32 v50, v68, v67
+; GFX11-NEXT:    v_max_f32_e32 v13, v13, v29
+; GFX11-NEXT:    v_dual_max_f32 v51, v70, v69 :: v_dual_max_f32 v52, v80, v71
+; GFX11-NEXT:    v_dual_max_f32 v9, v9, v25 :: v_dual_max_f32 v54, v84, v83
+; GFX11-NEXT:    v_dual_max_f32 v5, v5, v21 :: v_dual_max_f32 v14, v14, v30
+; GFX11-NEXT:    v_dual_max_f32 v11, v11, v27 :: v_dual_max_f32 v12, v12, v28
+; GFX11-NEXT:    v_dual_max_f32 v7, v7, v23 :: v_dual_max_f32 v8, v8, v24
+; GFX11-NEXT:    v_dual_max_f32 v3, v3, v19 :: v_dual_max_f32 v4, v4, v20
+; GFX11-NEXT:    v_perm_b32 v1, v1, v54, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v5, v5, v50, 0x3020706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v7, v7, v48, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v3, v3, v52, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v4, v4, v51, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v8, v8, v39, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v9, v9, v38, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v10, v10, v37, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v11, v11, v36, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v12, v12, v35, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v13, v13, v34, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v14, v14, v33, 0x3020706
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_max_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v17, 16, v31
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v31
+; GFX11-NEXT:    v_perm_b32 v6, v6, v49, 0x3020706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_max_f32 v17, v17, v17 :: v_dual_max_f32 v2, v2, v18
+; GFX11-NEXT:    v_max_f32_e32 v16, v16, v16
+; GFX11-NEXT:    v_perm_b32 v0, v0, v55, 0x3020706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_max_f32_e32 v15, v15, v17
+; GFX11-NEXT:    v_perm_b32 v2, v2, v53, 0x3020706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f32_e32 v16, v32, v16
+; GFX11-NEXT:    v_perm_b32 v15, v15, v16, 0x3020706
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %op = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
-  ret <2 x bfloat> %op
+  %op = call <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
+  ret <32 x bfloat> %op
 }
 
 declare bfloat @llvm.sqrt.bf16(bfloat)

diff  --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index de9e320a363a0c..f24cc6f177d625 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -3087,8 +3087,8 @@ define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2
   ret void
 }
 
-define void @void_func_v32i32_v2i16_v2f16_v2bf16(<32 x i32> %arg0, <2 x i16> %arg1, <2 x half> %arg2, <2 x bfloat> %arg3) #0 {
-; CI-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16:
+define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i16> %arg1, <2 x half> %arg2, <2 x bfloat> %arg3, <4 x bfloat> %arg4) #0 {
+; CI-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
@@ -3103,39 +3103,55 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16(<32 x i32> %arg0, <2 x i16> %ar
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:20
-; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:24
-; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:12
-; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:16
-; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:8
+; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:32
+; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:36
+; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:40
+; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:20
 ; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
-; CI-NEXT:    v_lshrrev_b32_e32 v14, 16, v16
-; CI-NEXT:    v_cvt_f16_f32_e32 v15, v17
-; CI-NEXT:    v_lshrrev_b32_e32 v13, 16, v20
-; CI-NEXT:    v_cvt_f16_f32_e32 v16, v18
+; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:28
+; CI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:24
+; CI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:12
+; CI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:16
+; CI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:8
 ; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:4
+; CI-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
+; CI-NEXT:    v_lshrrev_b32_e32 v11, 16, v17
+; CI-NEXT:    v_lshrrev_b32_e32 v16, 16, v18
+; CI-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
+; CI-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; CI-NEXT:    v_lshrrev_b32_e32 v9, 16, v20
+; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; CI-NEXT:    v_cvt_f16_f32_e32 v14, v14
 ; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_short v19, off, s[4:7], 0
+; CI-NEXT:    buffer_store_short v15, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_short v8, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_short v14, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_short v13, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_short v12, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_short v17, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_short v16, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_short v15, off, s[4:7], 0
+; CI-NEXT:    buffer_store_short v11, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_short v14, off, s[4:7], 0
+; CI-NEXT:    buffer_store_short v10, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_short v13, off, s[4:7], 0
+; CI-NEXT:    buffer_store_short v9, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16:
+; VI-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
@@ -3150,26 +3166,38 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16(<32 x i32> %arg0, <2 x i16> %ar
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:8
-; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:12
-; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:16
+; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:12
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v16
+; VI-NEXT:    v_lshrrev_b32_e32 v12, 16, v20
 ; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dword v20, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v17, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dword v16, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v18, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dword v17, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v19, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_short v13, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_short v16, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_short v12, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_short v20, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16:
+; GFX9-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
@@ -3184,36 +3212,54 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16(<32 x i32> %arg0, <2 x i16> %ar
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v20
 ; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v20, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dword v17, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v16, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dword v18, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v17, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dword v19, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v13, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v16, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v12, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v20, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16:
+; GFX11-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    s_clause 0x5
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:20
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:12
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v32
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v33
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3232,19 +3278,28 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16(<32 x i32> %arg0, <2 x i16> %ar
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    buffer_store_b32 v32, off, s[0:3], 0 dlc
+; GFX11-NEXT:    buffer_store_b32 v34, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    buffer_store_b32 v33, off, s[0:3], 0 dlc
+; GFX11-NEXT:    buffer_store_b32 v35, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_store_b32 v34, off, s[0:3], 0 dlc
+; GFX11-NEXT:    buffer_store_b32 v36, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b16 v38, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b16 v33, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b16 v37, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b16 v32, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile <2 x i16> %arg1, ptr addrspace(1) undef
   store volatile <2 x half> %arg2, ptr addrspace(1) undef
   store volatile <2 x bfloat> %arg3, ptr addrspace(1) undef
+  store volatile <4 x bfloat> %arg4, ptr addrspace(1) undef
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 91cff9d1a5419d..844aa57de05ce3 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -1582,8 +1582,8 @@ define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1)
   ret <6 x half> %shuffle
 }
 
-define amdgpu_kernel void @fma_shuffle(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C)  {
-; GFX9-LABEL: fma_shuffle:
+define amdgpu_kernel void @fma_shuffle_v2f16(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C)  {
+; GFX9-LABEL: fma_shuffle_v2f16:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
@@ -1600,7 +1600,7 @@ define amdgpu_kernel void @fma_shuffle(ptr addrspace(1) nocapture readonly %A, p
 ; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: fma_shuffle:
+; GFX10-LABEL: fma_shuffle_v2f16:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1619,7 +1619,7 @@ define amdgpu_kernel void @fma_shuffle(ptr addrspace(1) nocapture readonly %A, p
 ; GFX10-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: fma_shuffle:
+; GFX11-LABEL: fma_shuffle_v2f16:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
@@ -1758,12 +1758,8 @@ define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in,
   ret void
 }
 
-declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0
-declare i32 @llvm.amdgcn.workitem.id.x() #0
-
-attributes #0 = { nounwind readnone speculatable }
-define <2 x half> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
-; GFX9-LABEL: low16bits:
+define <2 x half> @low16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
+; GFX9-LABEL: low16bits_v2f16:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
@@ -1773,7 +1769,7 @@ define <2 x half> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX9-NEXT:    v_perm_b32 v0, v5, v4, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: low16bits:
+; GFX10-LABEL: low16bits_v2f16:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off
@@ -1782,7 +1778,7 @@ define <2 x half> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: low16bits:
+; GFX11-LABEL: low16bits_v2f16:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
@@ -1798,8 +1794,8 @@ entry:
   ret <2 x half> %vy1.2.vec.insert
 }
 
-define <2 x half> @hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
-; GFX9-LABEL: hi16bits:
+define <2 x half> @hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
+; GFX9-LABEL: hi16bits_v2f16:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
@@ -1809,7 +1805,7 @@ define <2 x half> @hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX9-NEXT:    v_perm_b32 v0, v5, v4, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: hi16bits:
+; GFX10-LABEL: hi16bits_v2f16:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off
@@ -1818,7 +1814,7 @@ define <2 x half> @hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: hi16bits:
+; GFX11-LABEL: hi16bits_v2f16:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
@@ -1834,8 +1830,8 @@ entry:
   ret <2 x half> %vy1.2.vec.insert
 }
 
-define <2 x half> @low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
-; GFX9-LABEL: low16hi16bits:
+define <2 x half> @low16hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
+; GFX9-LABEL: low16hi16bits_v2f16:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
@@ -1845,7 +1841,7 @@ define <2 x half> @low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX9-NEXT:    v_bfi_b32 v0, s4, v4, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: low16hi16bits:
+; GFX10-LABEL: low16hi16bits_v2f16:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off
@@ -1854,7 +1850,7 @@ define <2 x half> @low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v4, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: low16hi16bits:
+; GFX11-LABEL: low16hi16bits_v2f16:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
@@ -1870,8 +1866,8 @@ entry:
   ret <2 x half> %vy1.2.vec.insert
 }
 
-define <2 x half> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
-; GFX9-LABEL: hi16low16bits:
+define <2 x half> @hi16low16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
+; GFX9-LABEL: hi16low16bits_v2bf16:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
@@ -1880,7 +1876,7 @@ define <2 x half> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX9-NEXT:    v_alignbit_b32 v0, v5, v4, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: hi16low16bits:
+; GFX10-LABEL: hi16low16bits_v2bf16:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off
@@ -1889,7 +1885,7 @@ define <2 x half> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX10-NEXT:    v_alignbit_b32 v0, v5, v4, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: hi16low16bits:
+; GFX11-LABEL: hi16low16bits_v2bf16:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
@@ -2675,3 +2671,2354 @@ define void @shuffle_v16i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
   store <16 x i32> %shuffle, ptr addrspace(1) %out
   ret void
 }
+
+define <4 x bfloat> @shuffle_v4bf16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_23uu:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_23uu:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_23uu:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_234u:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT:    global_load_dword v5, v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
+; GFX9-NEXT:    v_and_or_b32 v0, v4, s4, v0
+; GFX9-NEXT:    v_and_or_b32 v1, v5, s4, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_234u:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT:    global_load_dword v5, v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v4, v0
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v5, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_234u:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 undef>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_u1u3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_u1u3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_u1u3:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_u1u3:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 undef, i32 1, i32 undef, i32 3>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_u3u1(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_u3u1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX9-NEXT:    v_and_or_b32 v0, v2, s4, v0
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_u3u1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_u3u1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 1>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_u3uu(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_u3uu:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_u3uu:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_u3uu:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_3u6u:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
+; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_3u6u:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
+; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_3u6u:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 undef, i32 6, i32 undef>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_3uu7:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
+; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_3uu7:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
+; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_3uu7:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 undef, i32 undef, i32 7>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_35u5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT:    global_load_dword v5, v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    s_mov_b32 s5, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v4, v5, s4
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX9-NEXT:    v_and_or_b32 v1, v5, s5, v1
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s5, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_35u5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT:    global_load_dword v5, v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v4, v5, 0x3020706
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v5, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_35u5:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 5, i32 undef, i32 5>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_357u:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    s_mov_b32 s5, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v6, v4, s4
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s5, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_357u:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v6, v4, 0x3020706
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_357u:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v4, v[0:1], off offset:4
+; GFX11-NEXT:    global_load_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v4, v0, 0x3020706
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 undef>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_0101:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_0101:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_0101:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_0123(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_0123:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_0123:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_0123:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_0145(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_0145:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off
+; GFX9-NEXT:    global_load_dword v5, v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
+; GFX9-NEXT:    v_and_or_b32 v0, v4, s4, v0
+; GFX9-NEXT:    v_and_or_b32 v1, v5, s4, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_0145:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off
+; GFX10-NEXT:    global_load_dword v5, v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v4, v0
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v5, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_0145:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_0167:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v5, v[0:1], off
+; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v5
+; GFX9-NEXT:    v_and_or_b32 v0, v5, s4, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_0167:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v5, v[0:1], off
+; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v4
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v5, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_0167:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_2301(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_2301:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX9-NEXT:    v_and_or_b32 v0, v2, s4, v0
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_2301:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_2301:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_2323(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_2323:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_2323:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_2323:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_2345(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_2345:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT:    global_load_dword v5, v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
+; GFX9-NEXT:    v_and_or_b32 v0, v4, s4, v0
+; GFX9-NEXT:    v_and_or_b32 v1, v5, s4, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_2345:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT:    global_load_dword v5, v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v4, v0
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v5, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_2345:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_2367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_2367:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
+; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v5
+; GFX9-NEXT:    v_and_or_b32 v0, v5, s4, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_2367:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
+; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v4
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v5, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_2367:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_4501(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_4501:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v4, v[2:3], off
+; GFX9-NEXT:    global_load_dword v5, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
+; GFX9-NEXT:    v_and_or_b32 v0, v4, s4, v0
+; GFX9-NEXT:    v_and_or_b32 v1, v5, s4, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_4501:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v4, v[2:3], off
+; GFX10-NEXT:    global_load_dword v5, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v4, v0
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v5, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_4501:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v2, v[2:3], off
+; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_4523(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_4523:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v5, v[2:3], off
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v5
+; GFX9-NEXT:    v_and_or_b32 v0, v5, s4, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_4523:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v5, v[2:3], off
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v4
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v5, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_4523:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v2, v[2:3], off
+; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_4545(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_4545:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_4545:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_4545:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[2:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 4, i32 5, i32 4, i32 5>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_4567(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_4567:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_4567:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_4567:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_6701(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_6701:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
+; GFX9-NEXT:    global_load_dword v5, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
+; GFX9-NEXT:    v_and_or_b32 v0, v4, s4, v0
+; GFX9-NEXT:    v_and_or_b32 v1, v5, s4, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_6701:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
+; GFX10-NEXT:    global_load_dword v5, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v4, v0
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v5, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_6701:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:4
+; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_6723(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_6723:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v5
+; GFX9-NEXT:    v_and_or_b32 v0, v5, s4, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_6723:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v4
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v5, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_6723:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:4
+; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_6745(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_6745:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX9-NEXT:    v_and_or_b32 v0, v2, s4, v0
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_6745:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_6745:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 7, i32 4, i32 5>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_6767(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_6767:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[2:3], off offset:4
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_6767:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[2:3], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_6767:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[2:3], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 7, i32 6, i32 7>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_2356:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_mov_b32 s5, 0x3020706
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_alignbit_b32 v1, v5, v4, 16
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v6
+; GFX9-NEXT:    v_and_or_b32 v0, v6, s4, v0
+; GFX9-NEXT:    v_perm_b32 v1, v4, v1, s5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_2356:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_alignbit_b32 v1, v5, v4, 16
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v6, v0
+; GFX10-NEXT:    v_perm_b32 v1, v4, v1, 0x3020706
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_2356:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_alignbit_b32 v2, v2, v1, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v3
+; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_5623(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_5623:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_alignbit_b32 v0, v6, v5, 16
+; GFX9-NEXT:    v_perm_b32 v0, v5, v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_5623:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_alignbit_b32 v0, v6, v5, 16
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v4
+; GFX10-NEXT:    v_perm_b32 v0, v5, v0, 0x3020706
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_5623:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_alignbit_b32 v0, v3, v2, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x3020706
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_3456:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_mov_b32 s5, 0x3020706
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_alignbit_b32 v0, v5, v4, 16
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_alignbit_b32 v2, v4, v6, 16
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX9-NEXT:    v_perm_b32 v1, v4, v0, s5
+; GFX9-NEXT:    v_and_or_b32 v0, v2, s4, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_3456:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_alignbit_b32 v0, v4, v6, 16
+; GFX10-NEXT:    v_alignbit_b32 v2, v5, v4, 16
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT:    v_perm_b32 v1, v4, v2, 0x3020706
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_3456:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX11-NEXT:    v_alignbit_b32 v2, v2, v1, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x3020706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_5634(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_5634:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_mov_b32 s5, 0x3020706
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_alignbit_b32 v0, v5, v4, 16
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_alignbit_b32 v1, v4, v6, 16
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX9-NEXT:    v_perm_b32 v0, v4, v0, s5
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_5634:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_alignbit_b32 v1, v4, v6, 16
+; GFX10-NEXT:    v_alignbit_b32 v0, v5, v4, 16
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX10-NEXT:    v_perm_b32 v0, v4, v0, 0x3020706
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_5634:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v4, v[0:1], off offset:4
+; GFX11-NEXT:    global_load_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_alignbit_b32 v2, v0, v4, 16
+; GFX11-NEXT:    v_alignbit_b32 v1, v1, v0, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v2, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_5734:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    s_mov_b32 s5, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_perm_b32 v0, v4, v5, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_alignbit_b32 v1, v4, v6, 16
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX9-NEXT:    v_perm_b32 v0, v4, v0, s4
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s5, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_5734:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_alignbit_b32 v1, v4, v6, 16
+; GFX10-NEXT:    v_perm_b32 v0, v4, v5, 0x3020706
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX10-NEXT:    v_perm_b32 v0, v4, v0, 0x3020706
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_5734:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v4, v[0:1], off offset:4
+; GFX11-NEXT:    global_load_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_alignbit_b32 v2, v0, v4, 16
+; GFX11-NEXT:    v_perm_b32 v1, v0, v1, 0x3020706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v2, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_0000:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_0000:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_0000:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> zeroinitializer
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_1010(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_1010:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_alignbit_b32 v0, v0, v0, 16
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_1010:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_alignbit_b32 v0, v0, v0, 16
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_1010:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_alignbit_b32 v0, v0, v0, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_1100(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_1100:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_mov_b32 s5, 0x7060706
+; GFX9-NEXT:    s_mov_b32 s6, 0x3020706
+; GFX9-NEXT:    s_mov_b32 s7, 0x3020504
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX9-NEXT:    v_perm_b32 v2, v1, v1, s5
+; GFX9-NEXT:    v_and_or_b32 v3, v1, s4, v0
+; GFX9-NEXT:    v_perm_b32 v0, v1, v2, s6
+; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s7
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_1100:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v1, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX10-NEXT:    v_perm_b32 v2, v1, v1, 0x7060706
+; GFX10-NEXT:    v_and_or_b32 v3, 0xffff, v1, v0
+; GFX10-NEXT:    v_perm_b32 v0, v1, v2, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x3020504
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_1100:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-NEXT:    v_perm_b32 v2, v1, v1, 0x7060706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_or_b32 v3, 0xffff, v1, v0
+; GFX11-NEXT:    v_perm_b32 v0, v1, v2, 0x3020706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v3, 0x3020504
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_6161(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_6161:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off
+; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_or_b32 v0, v5, s4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_6161:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off
+; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_6161:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_2333(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_2333:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060706
+; GFX9-NEXT:    s_mov_b32 s5, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v1, v0, v0, s4
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s5, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s5, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_2333:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v1, v0, v0, 0x7060706
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_2333:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v1, v0, v0, 0x7060706
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_6667(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_6667:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060706
+; GFX9-NEXT:    s_mov_b32 s5, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v1, v0, v0, s4
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s5, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s5, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_6667:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v1, v0, v0, 0x7060706
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_6667:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v1, v0, v0, 0x7060706
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v8bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v8bf16_0101:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v8bf16_0101:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v8bf16_0101:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v8bf16_0123(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v8bf16_0123:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v8bf16_0123:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v8bf16_0123:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v8bf16_4589(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v8bf16_4589:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:8
+; GFX9-NEXT:    global_load_dword v5, v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
+; GFX9-NEXT:    v_and_or_b32 v0, v4, s4, v0
+; GFX9-NEXT:    v_and_or_b32 v1, v5, s4, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v8bf16_4589:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:8
+; GFX10-NEXT:    global_load_dword v5, v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v4, v0
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v5, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v8bf16_4589:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:8
+; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 4, i32 5, i32 8, i32 9>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v8bf16_10_11_2_3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v8bf16_10_11_2_3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v5
+; GFX9-NEXT:    v_and_or_b32 v0, v5, s4, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v8bf16_10_11_2_3:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v4
+; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v5, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v8bf16_10_11_2_3:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:4
+; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 10, i32 11, i32 2, i32 3>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v8bf16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v8bf16_13_14_2_3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off offset:8
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_alignbit_b32 v0, v6, v5, 16
+; GFX9-NEXT:    v_perm_b32 v0, v5, v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v8bf16_13_14_2_3:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off offset:8
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_alignbit_b32 v0, v6, v5, 16
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v4
+; GFX10-NEXT:    v_perm_b32 v0, v5, v0, 0x3020706
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v8bf16_13_14_2_3:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off offset:8
+; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_alignbit_b32 v0, v3, v2, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x3020706
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v3bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v3bf16_0122:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v3bf16_0122:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v3bf16_0122:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <3 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <3 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <3 x bfloat> %val0, <3 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+  ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v2bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v2bf16_0122:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_alignbit_b32 v1, v0, v0, 16
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v2bf16_0122:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_alignbit_b32 v1, v0, v0, 16
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v2bf16_0122:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_alignbit_b32 v1, v0, v0, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <2 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <2 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <2 x bfloat> %val0, <2 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+  ret <4 x bfloat> %shuffle
+}
+
+define <6 x bfloat> @shuffle_v6bf16_452367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v6bf16_452367:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v6, v1
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v3
+; GFX9-NEXT:    v_mov_b32_e32 v3, v2
+; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[5:6], off
+; GFX9-NEXT:    global_load_dword v7, v[3:4], off
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, v7
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v6bf16_452367:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v6, v1
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    v_mov_b32_e32 v3, v2
+; GFX10-NEXT:    global_load_dwordx3 v[0:2], v[5:6], off
+; GFX10-NEXT:    global_load_dword v7, v[3:4], off
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v2, v7
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v6bf16_452367:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
+; GFX11-NEXT:    global_load_b96 v[0:2], v[0:1], off
+; GFX11-NEXT:    global_load_b32 v3, v[3:4], off
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <6 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <6 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <6 x bfloat> %val0, <6 x bfloat> %val1, <6 x i32> <i32 4, i32 5, i32 2, i32 3, i32 6, i32 7>
+  ret <6 x bfloat> %shuffle
+}
+
+define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C)  {
+; GFX9-LABEL: fma_shuffle_v2bf16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v6, s[6:7]
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[2:3]
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v6, s[0:1]
+; GFX9-NEXT:    s_mov_b32 s0, 0x3020706
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_fma_f32 v7, v9, v8, v7
+; GFX9-NEXT:    v_fma_f32 v0, v9, v2, v0
+; GFX9-NEXT:    v_fma_f32 v8, v11, v8, v12
+; GFX9-NEXT:    v_fma_f32 v1, v11, v2, v1
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v3
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v7
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v8
+; GFX9-NEXT:    v_fma_f32 v0, v4, v10, v0
+; GFX9-NEXT:    v_fma_f32 v2, v4, v3, v2
+; GFX9-NEXT:    v_fma_f32 v1, v5, v10, v1
+; GFX9-NEXT:    v_fma_f32 v3, v5, v3, v7
+; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s0
+; GFX9-NEXT:    v_perm_b32 v1, v3, v1, s0
+; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: fma_shuffle_v2bf16:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v6, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v6, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v6, s[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_fmac_f32_e32 v7, v9, v8
+; GFX10-NEXT:    v_fmac_f32_e32 v0, v9, v2
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v3
+; GFX10-NEXT:    v_fmac_f32_e32 v12, v11, v2
+; GFX10-NEXT:    v_fmac_f32_e32 v1, v11, v8
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v12
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_fmac_f32_e32 v0, v4, v10
+; GFX10-NEXT:    v_fmac_f32_e32 v5, v4, v3
+; GFX10-NEXT:    v_fmac_f32_e32 v7, v2, v10
+; GFX10-NEXT:    v_fmac_f32_e32 v1, v2, v3
+; GFX10-NEXT:    v_perm_b32 v0, v5, v0, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v1, v1, v7, 0x3020706
+; GFX10-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: fma_shuffle_v2bf16:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x10
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[4:5]
+; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3]
+; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_fmac_f32_e32 v1, v11, v8
+; GFX11-NEXT:    v_dual_fmac_f32 v12, v11, v2 :: v_dual_lshlrev_b32 v7, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_fmac_f32 v0, v9, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
+; GFX11-NEXT:    v_dual_fmac_f32 v1, v2, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_fmac_f32 v7, v9, v8 :: v_dual_fmac_f32 v0, v4, v10
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_fmac_f32_e32 v5, v4, v3
+; GFX11-NEXT:    v_fmac_f32_e32 v7, v2, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v5, v0, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v1, v1, v7, 0x3020706
+; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[4:5]
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp12 = zext i32 %tmp1 to i64
+  %arrayidx = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %A, i64 %tmp12
+  %tmp14 = load <4 x bfloat>, ptr addrspace(1) %arrayidx, align 8
+  %arrayidx1 = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %B, i64 %tmp12
+  %tmp15 = load <4 x bfloat>, ptr addrspace(1) %arrayidx1, align 8
+  %arrayidx2 = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %C, i64 %tmp12
+  %tmp16 = load <4 x bfloat>, ptr addrspace(1) %arrayidx2, align 8
+  %tmp17 = shufflevector <4 x bfloat> %tmp14, <4 x bfloat> undef, <2 x i32> zeroinitializer
+  %tmp18 = shufflevector <4 x bfloat> %tmp15, <4 x bfloat> undef, <2 x i32> <i32 0, i32 1>
+  %tmp19 = shufflevector <4 x bfloat> %tmp16, <4 x bfloat> undef, <2 x i32> <i32 0, i32 1>
+  %tmp20 = tail call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %tmp17, <2 x bfloat> %tmp18, <2 x bfloat> %tmp19)
+  %tmp21 = shufflevector <4 x bfloat> %tmp14, <4 x bfloat> undef, <2 x i32> <i32 1, i32 1>
+  %tmp22 = shufflevector <4 x bfloat> %tmp15, <4 x bfloat> undef, <2 x i32> <i32 2, i32 3>
+  %tmp23 = tail call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %tmp21, <2 x bfloat> %tmp22, <2 x bfloat> %tmp20)
+  %tmp24 = shufflevector <2 x bfloat> %tmp23, <2 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %tmp25 = shufflevector <4 x bfloat> %tmp24, <4 x bfloat> %tmp16, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  %tmp26 = shufflevector <4 x bfloat> %tmp14, <4 x bfloat> undef, <2 x i32> <i32 2, i32 2>
+  %tmp27 = shufflevector <4 x bfloat> %tmp25, <4 x bfloat> undef, <2 x i32> <i32 2, i32 3>
+  %tmp28 = tail call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %tmp26, <2 x bfloat> %tmp18, <2 x bfloat> %tmp27)
+  %tmp29 = shufflevector <4 x bfloat> %tmp14, <4 x bfloat> undef, <2 x i32> <i32 3, i32 3>
+  %tmp30 = tail call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %tmp29, <2 x bfloat> %tmp22, <2 x bfloat> %tmp28)
+  %tmp31 = shufflevector <2 x bfloat> %tmp30, <2 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %tmp32 = shufflevector <4 x bfloat> %tmp25, <4 x bfloat> %tmp31, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  store <4 x bfloat> %tmp32, ptr addrspace(1) %arrayidx2, align 8
+  ret void
+}
+
+define <4 x bfloat> @shuffle_v4bf16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_0456:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT:    global_load_dword v6, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x1000504
+; GFX9-NEXT:    s_mov_b32 s5, 0x3020706
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_alignbit_b32 v1, v5, v4, 16
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v6, v4, s4
+; GFX9-NEXT:    v_perm_b32 v1, v4, v1, s5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_0456:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_alignbit_b32 v1, v5, v4, 16
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v6, v4, 0x1000504
+; GFX10-NEXT:    v_perm_b32 v1, v4, v1, 0x3020706
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_0456:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_alignbit_b32 v1, v3, v2, 16
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x1000504
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
+  ret <4 x bfloat> %shuffle
+}
+
+define <2 x bfloat> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
+; GFX9-LABEL: low16bits:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off
+; GFX9-NEXT:    global_load_dword v5, v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x1000504
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v4, v5, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: low16bits:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off
+; GFX10-NEXT:    global_load_dword v5, v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v4, v5, 0x1000504
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: low16bits:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x1000504
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
+  %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4
+  %vy1.0.vec.insert = shufflevector <2 x bfloat> %0, <2 x bfloat> poison, <2 x i32> <i32 0, i32 undef>
+  %vy1.2.vec.insert = shufflevector <2 x bfloat> %vy1.0.vec.insert, <2 x bfloat> %1, <2 x i32> <i32 0, i32 2>
+  ret <2 x bfloat> %vy1.2.vec.insert
+}
+
+define <2 x bfloat> @hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
+; GFX9-LABEL: hi16bits_v2bf16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off
+; GFX9-NEXT:    global_load_dword v5, v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v4, v5, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: hi16bits_v2bf16:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off
+; GFX10-NEXT:    global_load_dword v5, v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v4, v5, 0x3020706
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: hi16bits_v2bf16:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
+  %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4
+  %vy1.0.vec.insert = shufflevector <2 x bfloat> %0, <2 x bfloat> poison, <2 x i32> <i32 1, i32 undef>
+  %vy1.2.vec.insert = shufflevector <2 x bfloat> %vy1.0.vec.insert, <2 x bfloat> %1, <2 x i32> <i32 0, i32 3>
+  ret <2 x bfloat> %vy1.2.vec.insert
+}
+
+define <2 x bfloat> @low16hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
+; GFX9-LABEL: low16hi16bits_v2bf16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off
+; GFX9-NEXT:    global_load_dword v5, v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020504
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v4, v5, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: low16hi16bits_v2bf16:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off
+; GFX10-NEXT:    global_load_dword v5, v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v4, v5, 0x3020504
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: low16hi16bits_v2bf16:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020504
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
+  %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4
+  %vy1.0.vec.insert = shufflevector <2 x bfloat> %0, <2 x bfloat> poison, <2 x i32> <i32 0, i32 undef>
+  %vy1.2.vec.insert = shufflevector <2 x bfloat> %vy1.0.vec.insert, <2 x bfloat> %1, <2 x i32> <i32 0, i32 3>
+  ret <2 x bfloat> %vy1.2.vec.insert
+}
+
+define <2 x bfloat> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
+; GFX9-LABEL: hi16low16bits:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off
+; GFX9-NEXT:    global_load_dword v5, v[2:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_alignbit_b32 v0, v5, v4, 16
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: hi16low16bits:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off
+; GFX10-NEXT:    global_load_dword v5, v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_alignbit_b32 v0, v5, v4, 16
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: hi16low16bits:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
+  %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4
+  %vy1.0.vec.insert = shufflevector <2 x bfloat> %0, <2 x bfloat> poison, <2 x i32> <i32 1, i32 undef>
+  %vy1.2.vec.insert = shufflevector <2 x bfloat> %vy1.0.vec.insert, <2 x bfloat> %1, <2 x i32> <i32 0, i32 2>
+  ret <2 x bfloat> %vy1.2.vec.insert
+}
+
+define <2 x bfloat> @v2bfloat_hi16bits(ptr addrspace(1) %x0) {
+; GFX9-LABEL: v2bfloat_hi16bits:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v2bfloat_hi16bits:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v2bfloat_hi16bits:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %load0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
+  %insert1 = insertelement <2 x bfloat> undef, bfloat 0.0, i32 0
+  %insert2 = insertelement <2 x bfloat> %insert1, bfloat 0.0, i32 1
+  %vec.ret = shufflevector <2 x bfloat> %insert2, <2 x bfloat> %load0, <2 x i32> <i32 0, i32 3>
+  ret <2 x bfloat> %vec.ret
+}
+
+define void @shuffle_v8bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
+; GFX9-LABEL: shuffle_v8bf16_concat:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v8bf16_concat:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v8bf16_concat:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x bfloat> %shuffle, ptr addrspace(1) %out
+  ret void
+}
+
+define void @shuffle_v16bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
+; GFX9-LABEL: shuffle_v16bf16_concat:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
+; GFX9-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v16bf16_concat:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
+; GFX10-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v16bf16_concat:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[6:9], v[2:3], off
+; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:16
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <16 x bfloat> %shuffle, ptr addrspace(1) %out
+  ret void
+}
+
+define void @shuffle_v32bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
+; GFX9-LABEL: shuffle_v32bf16_concat:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
+; GFX9-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx4 v[18:21], v[0:1], off offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:32
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off offset:48
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[14:17], off
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[18:21], off offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v32bf16_concat:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
+; GFX10-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off offset:16
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx4 v[18:21], v[0:1], off offset:16
+; GFX10-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:32
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off offset:48
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[14:17], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[18:21], off offset:16
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v32bf16_concat:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_load_b128 v[6:9], v[2:3], off
+; GFX11-NEXT:    global_load_b128 v[10:13], v[2:3], off offset:16
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_load_b128 v[14:17], v[0:1], off
+; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off offset:16
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:32
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:48
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    global_store_b128 v[4:5], v[14:17], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off offset:16
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val0 = load <16 x bfloat>, ptr addrspace(1) %arg0
+  %val1 = load <16 x bfloat>, ptr addrspace(1) %arg1
+  %shuffle = shufflevector <16 x bfloat> %val0, <16 x bfloat> %val1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  store <32 x bfloat> %shuffle, ptr addrspace(1) %out
+  ret void
+}
+
+declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0
+declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>) #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone speculatable }


        


More information about the llvm-commits mailing list