[llvm] [GlobalISel] Fix buildCopyFromRegs for split vectors (PR #77448)

Pierre van Houtryve via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 16 01:04:01 PST 2024


https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/77448

>From 7a2c323d65fe291e91faefd7de45b034aafc16f6 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 9 Jan 2024 12:47:47 +0100
Subject: [PATCH 1/6] [GlobalISel] Fix buildCopyFromRegs for split vectors

Fixes #77055
---
 llvm/lib/CodeGen/GlobalISel/CallLowering.cpp | 34 ++++++-
 llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll  | 96 ++++++++++++++++++++
 2 files changed, 127 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll

diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 2953433deff1f0..b69f90f057ccdc 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -478,9 +478,37 @@ static void buildCopyFromRegs(MachineIRBuilder &B, ArrayRef<Register> OrigRegs,
   } else {
     // Vector was split, and elements promoted to a wider type.
     // FIXME: Should handle floating point promotions.
-    LLT BVType = LLT::fixed_vector(LLTy.getNumElements(), PartLLT);
-    auto BV = B.buildBuildVector(BVType, Regs);
-    B.buildTrunc(OrigRegs[0], BV);
+    unsigned NumElts = LLTy.getNumElements();
+    LLT BVType = LLT::fixed_vector(NumElts, PartLLT);
+
+    Register BuildVec;
+    if (NumElts == Regs.size())
+      BuildVec = B.buildBuildVector(BVType, Regs).getReg(0);
+    else {
+      SmallVector<Register, 0> BVRegs;
+      BVRegs.reserve(NumElts);
+
+      // Vector elements are packed in the inputs.
+      // e.g. we have a <4 x s16> but 2 x s32 in regs.
+      assert(NumElts > Regs.size());
+      LLT SrcEltTy = MRI.getType(Regs[0]);
+      LLT OriginalEltTy = MRI.getType(OrigRegs[0]).getElementType();
+
+      // Input registers contain packed elements.
+      // Determine how many elements per reg.
+      assert((SrcEltTy.getSizeInBits() % OriginalEltTy.getSizeInBits()) == 0);
+      unsigned EltPerReg =
+          (SrcEltTy.getSizeInBits() / OriginalEltTy.getSizeInBits());
+
+      for (Register R : Regs) {
+        auto Unmerge = B.buildUnmerge(OriginalEltTy, R);
+        for (unsigned K = 0; K < EltPerReg; ++K)
+          BVRegs.push_back(B.buildAnyExt(PartLLT, Unmerge.getReg(K)).getReg(0));
+      }
+      assert(BVRegs.size() == NumElts);
+      BuildVec = B.buildBuildVector(BVType, BVRegs).getReg(0);
+    }
+    B.buildTrunc(OrigRegs[0], BuildVec);
   }
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll
new file mode 100644
index 00000000000000..3037b84b25775a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -global-isel -mtriple=amdgcn | FileCheck %s -check-prefixes=GCN
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefixes=GFX7
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefix=GFX11
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefix=GFX11
+
+; TODO: expand testcases - currently only contains cases that were known to crash.
+
+; assert in IRTranslator, #77055
+define <4 x bfloat> @v4bf16(<4 x bfloat> %arg0) {
+; GCN-LABEL: v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT:    v_or_b32_e32 v3, v1, v0
+; GCN-NEXT:    v_or_b32_e32 v2, v4, v2
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_or_b32_e32 v4, v1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT:    v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, v2
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT:    v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX10-NEXT:    v_mov_b32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v4bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v3, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %res = shufflevector <4 x bfloat> %arg0, <4 x bfloat> zeroinitializer, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
+  ret <4 x bfloat> %res
+}

>From 9ce5390dbcba8ee9998e3f73770bdd92d8fe633c Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 9 Jan 2024 13:52:49 +0100
Subject: [PATCH 2/6] Fix v3bf16 cases + improve testing

---
 llvm/lib/CodeGen/GlobalISel/CallLowering.cpp |     8 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll  | 13814 ++++++++++++++++-
 2 files changed, 13761 insertions(+), 61 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index b69f90f057ccdc..5b6dc0e5e20c40 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -492,6 +492,7 @@ static void buildCopyFromRegs(MachineIRBuilder &B, ArrayRef<Register> OrigRegs,
       // e.g. we have a <4 x s16> but 2 x s32 in regs.
       assert(NumElts > Regs.size());
       LLT SrcEltTy = MRI.getType(Regs[0]);
+
       LLT OriginalEltTy = MRI.getType(OrigRegs[0]).getElementType();
 
       // Input registers contain packed elements.
@@ -505,7 +506,12 @@ static void buildCopyFromRegs(MachineIRBuilder &B, ArrayRef<Register> OrigRegs,
         for (unsigned K = 0; K < EltPerReg; ++K)
           BVRegs.push_back(B.buildAnyExt(PartLLT, Unmerge.getReg(K)).getReg(0));
       }
-      assert(BVRegs.size() == NumElts);
+
+      // We may have some more elements in BVRegs, e.g. if we have 2 s32 pieces for a <3 x s16> vector. We should have less than EltPerReg extra items.
+      if(BVRegs.size() > NumElts) {
+        assert((BVRegs.size() - NumElts) < EltPerReg);
+        BVRegs.truncate(NumElts);
+      }
       BuildVec = B.buildBuildVector(BVType, BVRegs).getReg(0);
     }
     B.buildTrunc(OrigRegs[0], BuildVec);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll
index 3037b84b25775a..aaefb634b132aa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll
@@ -4,93 +4,13787 @@
 ; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8
 ; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9
 ; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefix=GFX11
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefix=GFX11
 
-; TODO: expand testcases - currently only contains cases that were known to crash.
+; FIXME: GFX11 cannot select some truncs: %0:vgpr_32(s16) = G_TRUNC %1:vgpr_32(s32)
+; llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefix=GFX11
+; llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefix=GFX11
 
-; assert in IRTranslator, #77055
-define <4 x bfloat> @v4bf16(<4 x bfloat> %arg0) {
-; GCN-LABEL: v4bf16:
+define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_load_store:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_load_store:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_load_store:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_short v[2:3], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_load_store:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_short v[2:3], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_load_store:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_short v[2:3], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %val = load bfloat, ptr addrspace(1) %in
+  store bfloat %val, ptr addrspace(1) %out
+  ret void
+}
+
+define <2 x bfloat> @v_load_global_v2bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %load = load <2 x bfloat>, ptr addrspace(1) %ptr
+  ret <2 x bfloat> %load
+}
+
+define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v3bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GCN-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-NEXT:    v_mov_b32_e32 v2, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v3bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v2, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v3bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v3bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v3bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %load = load <3 x bfloat>, ptr addrspace(1) %ptr
+  ret <3 x bfloat> %load
+}
+
+define <4 x bfloat> @v_load_global_v4bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GCN-NEXT:    v_mov_b32_e32 v0, v4
+; GCN-NEXT:    v_mov_b32_e32 v2, v5
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX7-NEXT:    v_mov_b32_e32 v2, v1
+; GFX7-NEXT:    v_mov_b32_e32 v1, v4
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %load = load <4 x bfloat>, ptr addrspace(1) %ptr
+  ret <4 x bfloat> %load
+}
+
+define <6 x bfloat> @v_load_global_v6bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v6bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
+; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v8
+; GCN-NEXT:    v_mov_b32_e32 v0, v6
+; GCN-NEXT:    v_mov_b32_e32 v2, v7
+; GCN-NEXT:    v_mov_b32_e32 v4, v8
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v6bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx3 v[6:8], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v8
+; GFX7-NEXT:    v_mov_b32_e32 v0, v6
+; GFX7-NEXT:    v_mov_b32_e32 v2, v7
+; GFX7-NEXT:    v_mov_b32_e32 v4, v8
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v6bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx3 v[2:4], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v6bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx3 v[2:4], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v6bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx3 v[2:4], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX10-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %load = load <6 x bfloat>, ptr addrspace(1) %ptr
+  ret <6 x bfloat> %load
+}
+
+define <8 x bfloat> @v_load_global_v8bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v8bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
+; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
+; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v11
+; GCN-NEXT:    v_mov_b32_e32 v0, v8
+; GCN-NEXT:    v_mov_b32_e32 v2, v9
+; GCN-NEXT:    v_mov_b32_e32 v4, v10
+; GCN-NEXT:    v_mov_b32_e32 v6, v11
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v8bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v11
+; GFX7-NEXT:    v_mov_b32_e32 v0, v8
+; GFX7-NEXT:    v_mov_b32_e32 v2, v9
+; GFX7-NEXT:    v_mov_b32_e32 v4, v10
+; GFX7-NEXT:    v_mov_b32_e32 v6, v11
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v8bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, v1
+; GFX8-NEXT:    v_mov_b32_e32 v1, v4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v8bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v8bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %load = load <8 x bfloat>, ptr addrspace(1) %ptr
+  ret <8 x bfloat> %load
+}
+
+define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v16bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    buffer_load_dwordx4 v[23:26], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_load_dwordx4 v[19:22], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v23
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v24
+; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v25
+; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v26
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v19
+; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v20
+; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v21
+; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v22
+; GCN-NEXT:    v_mov_b32_e32 v0, v23
+; GCN-NEXT:    v_mov_b32_e32 v2, v24
+; GCN-NEXT:    v_mov_b32_e32 v4, v25
+; GCN-NEXT:    v_mov_b32_e32 v6, v26
+; GCN-NEXT:    v_mov_b32_e32 v8, v19
+; GCN-NEXT:    v_mov_b32_e32 v10, v20
+; GCN-NEXT:    v_mov_b32_e32 v12, v21
+; GCN-NEXT:    v_mov_b32_e32 v14, v22
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v16bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[22:25], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx4 v[18:21], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v22
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v23
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v24
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v25
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v19
+; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 16, v20
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v21
+; GFX7-NEXT:    v_mov_b32_e32 v0, v22
+; GFX7-NEXT:    v_mov_b32_e32 v2, v23
+; GFX7-NEXT:    v_mov_b32_e32 v4, v24
+; GFX7-NEXT:    v_mov_b32_e32 v6, v25
+; GFX7-NEXT:    v_mov_b32_e32 v8, v18
+; GFX7-NEXT:    v_mov_b32_e32 v10, v19
+; GFX7-NEXT:    v_mov_b32_e32 v12, v20
+; GFX7-NEXT:    v_mov_b32_e32 v14, v21
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v16bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v11
+; GFX8-NEXT:    v_mov_b32_e32 v0, v8
+; GFX8-NEXT:    v_mov_b32_e32 v2, v9
+; GFX8-NEXT:    v_mov_b32_e32 v4, v10
+; GFX8-NEXT:    v_mov_b32_e32 v6, v11
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v16bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v11
+; GFX9-NEXT:    v_mov_b32_e32 v0, v8
+; GFX9-NEXT:    v_mov_b32_e32 v2, v9
+; GFX9-NEXT:    v_mov_b32_e32 v4, v10
+; GFX9-NEXT:    v_mov_b32_e32 v6, v11
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v16bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v11
+; GFX10-NEXT:    v_mov_b32_e32 v0, v8
+; GFX10-NEXT:    v_mov_b32_e32 v2, v9
+; GFX10-NEXT:    v_mov_b32_e32 v4, v10
+; GFX10-NEXT:    v_mov_b32_e32 v6, v11
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %load = load <16 x bfloat>, ptr addrspace(1) %ptr
+  ret <16 x bfloat> %load
+}
+
+define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v32bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    buffer_load_dwordx4 v[34:37], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    buffer_load_dwordx4 v[39:42], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT:    buffer_load_dwordx4 v[48:51], v[0:1], s[4:7], 0 addr64 offset:32
+; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_lshrrev_b32_e32 v38, 16, v34
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v35
+; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v36
+; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v37
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v39
+; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v40
+; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v41
+; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v42
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshrrev_b32_e32 v17, 16, v48
+; GCN-NEXT:    buffer_load_dwordx4 v[52:55], v[0:1], s[4:7], 0 addr64 offset:48
+; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v49
+; GCN-NEXT:    v_lshrrev_b32_e32 v21, 16, v50
+; GCN-NEXT:    v_lshrrev_b32_e32 v23, 16, v51
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshrrev_b32_e32 v25, 16, v52
+; GCN-NEXT:    v_lshrrev_b32_e32 v27, 16, v53
+; GCN-NEXT:    v_lshrrev_b32_e32 v29, 16, v54
+; GCN-NEXT:    v_lshrrev_b32_e32 v31, 16, v55
+; GCN-NEXT:    v_mov_b32_e32 v0, v34
+; GCN-NEXT:    v_mov_b32_e32 v2, v35
+; GCN-NEXT:    v_mov_b32_e32 v4, v36
+; GCN-NEXT:    v_mov_b32_e32 v6, v37
+; GCN-NEXT:    v_mov_b32_e32 v8, v39
+; GCN-NEXT:    v_mov_b32_e32 v10, v40
+; GCN-NEXT:    v_mov_b32_e32 v12, v41
+; GCN-NEXT:    v_mov_b32_e32 v14, v42
+; GCN-NEXT:    v_mov_b32_e32 v16, v48
+; GCN-NEXT:    v_mov_b32_e32 v18, v49
+; GCN-NEXT:    v_mov_b32_e32 v20, v50
+; GCN-NEXT:    v_mov_b32_e32 v22, v51
+; GCN-NEXT:    v_mov_b32_e32 v24, v52
+; GCN-NEXT:    v_mov_b32_e32 v26, v53
+; GCN-NEXT:    v_mov_b32_e32 v28, v54
+; GCN-NEXT:    v_mov_b32_e32 v30, v55
+; GCN-NEXT:    v_mov_b32_e32 v1, v38
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v32bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX7-NEXT:    buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[38:41], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx4 v[48:51], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT:    buffer_load_dwordx4 v[34:37], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT:    buffer_load_dwordx4 v[52:55], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v40
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v41
+; GFX7-NEXT:    v_mov_b32_e32 v4, v40
+; GFX7-NEXT:    v_mov_b32_e32 v6, v41
+; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v38
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v39
+; GFX7-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v48
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v49
+; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 16, v50
+; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v51
+; GFX7-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v17, 16, v34
+; GFX7-NEXT:    v_lshrrev_b32_e32 v19, 16, v35
+; GFX7-NEXT:    v_lshrrev_b32_e32 v21, 16, v36
+; GFX7-NEXT:    v_lshrrev_b32_e32 v23, 16, v37
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v25, 16, v52
+; GFX7-NEXT:    v_lshrrev_b32_e32 v27, 16, v53
+; GFX7-NEXT:    v_lshrrev_b32_e32 v29, 16, v54
+; GFX7-NEXT:    v_lshrrev_b32_e32 v31, 16, v55
+; GFX7-NEXT:    v_mov_b32_e32 v0, v38
+; GFX7-NEXT:    v_mov_b32_e32 v2, v39
+; GFX7-NEXT:    v_mov_b32_e32 v8, v48
+; GFX7-NEXT:    v_mov_b32_e32 v10, v49
+; GFX7-NEXT:    v_mov_b32_e32 v12, v50
+; GFX7-NEXT:    v_mov_b32_e32 v14, v51
+; GFX7-NEXT:    v_mov_b32_e32 v16, v34
+; GFX7-NEXT:    v_mov_b32_e32 v18, v35
+; GFX7-NEXT:    v_mov_b32_e32 v20, v36
+; GFX7-NEXT:    v_mov_b32_e32 v22, v37
+; GFX7-NEXT:    v_mov_b32_e32 v24, v52
+; GFX7-NEXT:    v_mov_b32_e32 v26, v53
+; GFX7-NEXT:    v_mov_b32_e32 v28, v54
+; GFX7-NEXT:    v_mov_b32_e32 v30, v55
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v32bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[22:25], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[18:21], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v22
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v23
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v24
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v25
+; GFX8-NEXT:    v_mov_b32_e32 v0, v22
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v19
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v20
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v21
+; GFX8-NEXT:    v_mov_b32_e32 v2, v23
+; GFX8-NEXT:    v_mov_b32_e32 v4, v24
+; GFX8-NEXT:    v_mov_b32_e32 v6, v25
+; GFX8-NEXT:    v_mov_b32_e32 v8, v18
+; GFX8-NEXT:    v_mov_b32_e32 v10, v19
+; GFX8-NEXT:    v_mov_b32_e32 v12, v20
+; GFX8-NEXT:    v_mov_b32_e32 v14, v21
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v32bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[22:25], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx4 v[18:21], v[0:1], off offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v22
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v23
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v24
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v25
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v19
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v20
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v21
+; GFX9-NEXT:    v_mov_b32_e32 v0, v22
+; GFX9-NEXT:    v_mov_b32_e32 v2, v23
+; GFX9-NEXT:    v_mov_b32_e32 v4, v24
+; GFX9-NEXT:    v_mov_b32_e32 v6, v25
+; GFX9-NEXT:    v_mov_b32_e32 v8, v18
+; GFX9-NEXT:    v_mov_b32_e32 v10, v19
+; GFX9-NEXT:    v_mov_b32_e32 v12, v20
+; GFX9-NEXT:    v_mov_b32_e32 v14, v21
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v32bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dwordx4 v[22:25], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx4 v[18:21], v[0:1], off offset:16
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v22
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v23
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v24
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v25
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v19
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v20
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v21
+; GFX10-NEXT:    v_mov_b32_e32 v0, v22
+; GFX10-NEXT:    v_mov_b32_e32 v2, v23
+; GFX10-NEXT:    v_mov_b32_e32 v4, v24
+; GFX10-NEXT:    v_mov_b32_e32 v6, v25
+; GFX10-NEXT:    v_mov_b32_e32 v8, v18
+; GFX10-NEXT:    v_mov_b32_e32 v10, v19
+; GFX10-NEXT:    v_mov_b32_e32 v12, v20
+; GFX10-NEXT:    v_mov_b32_e32 v14, v21
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %load = load <32 x bfloat>, ptr addrspace(1) %ptr
+  ret <32 x bfloat> %load
+}
+
+define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v64bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    buffer_load_dwordx4 v[21:24], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_load_dwordx4 v[25:28], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT:    buffer_load_dwordx4 v[29:32], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT:    buffer_load_dwordx4 v[17:20], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT:    buffer_load_dwordx4 v[13:16], v[1:2], s[4:7], 0 addr64 offset:64
+; GCN-NEXT:    buffer_load_dwordx4 v[9:12], v[1:2], s[4:7], 0 addr64 offset:80
+; GCN-NEXT:    buffer_load_dwordx4 v[5:8], v[1:2], s[4:7], 0 addr64 offset:96
+; GCN-NEXT:    buffer_load_dwordx4 v[1:4], v[1:2], s[4:7], 0 addr64 offset:112
+; GCN-NEXT:    s_waitcnt vmcnt(7)
+; GCN-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_add_i32_e32 v21, vcc, 4, v0
+; GCN-NEXT:    buffer_store_dword v22, v21, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v21, vcc, 8, v0
+; GCN-NEXT:    buffer_store_dword v23, v21, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v21, vcc, 12, v0
+; GCN-NEXT:    buffer_store_dword v24, v21, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v21, vcc, 16, v0
+; GCN-NEXT:    s_waitcnt expcnt(2)
+; GCN-NEXT:    v_add_i32_e32 v22, vcc, 20, v0
+; GCN-NEXT:    s_waitcnt expcnt(1)
+; GCN-NEXT:    v_add_i32_e32 v23, vcc, 24, v0
+; GCN-NEXT:    s_waitcnt vmcnt(10)
+; GCN-NEXT:    buffer_store_dword v25, v21, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v21, vcc, 28, v0
+; GCN-NEXT:    buffer_store_dword v26, v22, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v22, vcc, 32, v0
+; GCN-NEXT:    buffer_store_dword v27, v23, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v23, vcc, 36, v0
+; GCN-NEXT:    buffer_store_dword v28, v21, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v21, vcc, 40, v0
+; GCN-NEXT:    s_waitcnt expcnt(4)
+; GCN-NEXT:    v_add_i32_e32 v24, vcc, 44, v0
+; GCN-NEXT:    s_waitcnt expcnt(3)
+; GCN-NEXT:    v_add_i32_e32 v25, vcc, 48, v0
+; GCN-NEXT:    s_waitcnt expcnt(2)
+; GCN-NEXT:    v_add_i32_e32 v26, vcc, 52, v0
+; GCN-NEXT:    s_waitcnt expcnt(1)
+; GCN-NEXT:    v_add_i32_e32 v27, vcc, 56, v0
+; GCN-NEXT:    s_waitcnt vmcnt(13)
+; GCN-NEXT:    buffer_store_dword v29, v22, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v22, vcc, 60, v0
+; GCN-NEXT:    buffer_store_dword v30, v23, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v23, vcc, 64, v0
+; GCN-NEXT:    buffer_store_dword v31, v21, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v21, 0x44
+; GCN-NEXT:    buffer_store_dword v32, v24, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v24, 0x48
+; GCN-NEXT:    s_waitcnt expcnt(4)
+; GCN-NEXT:    v_mov_b32_e32 v28, 0x4c
+; GCN-NEXT:    s_waitcnt expcnt(3)
+; GCN-NEXT:    v_mov_b32_e32 v29, 0x50
+; GCN-NEXT:    s_waitcnt expcnt(2)
+; GCN-NEXT:    v_mov_b32_e32 v30, 0x54
+; GCN-NEXT:    s_waitcnt expcnt(1)
+; GCN-NEXT:    v_mov_b32_e32 v31, 0x58
+; GCN-NEXT:    s_waitcnt vmcnt(14)
+; GCN-NEXT:    buffer_store_dword v17, v25, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v17, 0x5c
+; GCN-NEXT:    buffer_store_dword v18, v26, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v18, 0x60
+; GCN-NEXT:    buffer_store_dword v19, v27, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v19, 0x64
+; GCN-NEXT:    buffer_store_dword v20, v22, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v20, 0x68
+; GCN-NEXT:    v_add_i32_e32 v22, vcc, 0x6c, v0
+; GCN-NEXT:    v_add_i32_e32 v25, vcc, 0x70, v0
+; GCN-NEXT:    v_add_i32_e32 v26, vcc, 0x74, v0
+; GCN-NEXT:    v_add_i32_e32 v27, vcc, 0x78, v0
+; GCN-NEXT:    v_add_i32_e32 v21, vcc, v0, v21
+; GCN-NEXT:    v_add_i32_e32 v24, vcc, v0, v24
+; GCN-NEXT:    v_add_i32_e32 v28, vcc, v0, v28
+; GCN-NEXT:    v_add_i32_e32 v29, vcc, v0, v29
+; GCN-NEXT:    v_add_i32_e32 v30, vcc, v0, v30
+; GCN-NEXT:    v_add_i32_e32 v31, vcc, v0, v31
+; GCN-NEXT:    v_add_i32_e32 v17, vcc, v0, v17
+; GCN-NEXT:    v_add_i32_e32 v18, vcc, v0, v18
+; GCN-NEXT:    v_add_i32_e32 v19, vcc, v0, v19
+; GCN-NEXT:    v_add_i32_e32 v20, vcc, v0, v20
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
+; GCN-NEXT:    buffer_store_dword v13, v23, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v14, v21, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v15, v24, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v16, v28, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v9, v29, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v10, v30, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v11, v31, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v12, v17, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(14)
+; GCN-NEXT:    buffer_store_dword v5, v18, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v6, v19, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v7, v20, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v8, v22, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v1, v25, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v2, v26, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v3, v27, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v64bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[21:24], v[1:2], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx4 v[25:28], v[1:2], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT:    buffer_load_dwordx4 v[29:32], v[1:2], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT:    buffer_load_dwordx4 v[13:16], v[1:2], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT:    buffer_load_dwordx4 v[17:20], v[1:2], s[4:7], 0 addr64 offset:64
+; GFX7-NEXT:    buffer_load_dwordx4 v[9:12], v[1:2], s[4:7], 0 addr64 offset:80
+; GFX7-NEXT:    buffer_load_dwordx4 v[5:8], v[1:2], s[4:7], 0 addr64 offset:96
+; GFX7-NEXT:    buffer_load_dwordx4 v[1:4], v[1:2], s[4:7], 0 addr64 offset:112
+; GFX7-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v21, vcc, 4, v0
+; GFX7-NEXT:    buffer_store_dword v22, v21, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v21, vcc, 8, v0
+; GFX7-NEXT:    buffer_store_dword v23, v21, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v21, vcc, 12, v0
+; GFX7-NEXT:    v_add_i32_e32 v23, vcc, 16, v0
+; GFX7-NEXT:    buffer_store_dword v24, v21, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(10)
+; GFX7-NEXT:    buffer_store_dword v25, v23, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v23, vcc, 20, v0
+; GFX7-NEXT:    buffer_store_dword v26, v23, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v23, vcc, 24, v0
+; GFX7-NEXT:    buffer_store_dword v27, v23, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v23, vcc, 28, v0
+; GFX7-NEXT:    v_add_i32_e32 v26, vcc, 32, v0
+; GFX7-NEXT:    buffer_store_dword v28, v23, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v27, vcc, 36, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(13)
+; GFX7-NEXT:    buffer_store_dword v29, v26, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v26, vcc, 40, v0
+; GFX7-NEXT:    v_mov_b32_e32 v21, 0x44
+; GFX7-NEXT:    v_mov_b32_e32 v22, 0x48
+; GFX7-NEXT:    v_mov_b32_e32 v23, 0x4c
+; GFX7-NEXT:    v_mov_b32_e32 v24, 0x50
+; GFX7-NEXT:    v_mov_b32_e32 v25, 0x54
+; GFX7-NEXT:    buffer_store_dword v30, v27, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v27, vcc, 44, v0
+; GFX7-NEXT:    buffer_store_dword v31, v26, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v26, vcc, 48, v0
+; GFX7-NEXT:    buffer_store_dword v32, v27, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v27, vcc, 52, v0
+; GFX7-NEXT:    v_add_i32_e32 v28, vcc, 56, v0
+; GFX7-NEXT:    v_add_i32_e32 v29, vcc, 60, v0
+; GFX7-NEXT:    v_add_i32_e32 v30, vcc, 64, v0
+; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v0, v21
+; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v0, v22
+; GFX7-NEXT:    v_add_i32_e32 v23, vcc, v0, v23
+; GFX7-NEXT:    v_add_i32_e32 v24, vcc, v0, v24
+; GFX7-NEXT:    v_add_i32_e32 v25, vcc, v0, v25
+; GFX7-NEXT:    s_waitcnt vmcnt(14)
+; GFX7-NEXT:    buffer_store_dword v13, v26, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v14, v27, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v15, v28, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v16, v29, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v17, v30, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v18, v21, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v19, v22, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v20, v23, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v9, v24, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v10, v25, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v9, 0x58
+; GFX7-NEXT:    v_add_i32_e32 v9, vcc, v0, v9
+; GFX7-NEXT:    buffer_store_dword v11, v9, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v9, 0x5c
+; GFX7-NEXT:    v_add_i32_e32 v9, vcc, v0, v9
+; GFX7-NEXT:    buffer_store_dword v12, v9, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v9, 0x60
+; GFX7-NEXT:    v_add_i32_e32 v9, vcc, v0, v9
+; GFX7-NEXT:    s_waitcnt vmcnt(14)
+; GFX7-NEXT:    buffer_store_dword v5, v9, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0x64
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
+; GFX7-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0x68
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
+; GFX7-NEXT:    buffer_store_dword v7, v5, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 0x6c, v0
+; GFX7-NEXT:    buffer_store_dword v8, v5, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 0x70, v0
+; GFX7-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 0x74, v0
+; GFX7-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 0x78, v0
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
+; GFX7-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v64bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8-NEXT:    buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8-NEXT:    flat_load_dwordx4 v[38:41], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 16, v0
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[48:51], v[2:3]
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 32, v0
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 48, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[34:37], v[2:3]
+; GFX8-NEXT:    flat_load_dwordx4 v[52:55], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v40
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v41
+; GFX8-NEXT:    v_mov_b32_e32 v4, v40
+; GFX8-NEXT:    v_mov_b32_e32 v6, v41
+; GFX8-NEXT:    buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v38
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v39
+; GFX8-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v48
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v49
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v50
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v51
+; GFX8-NEXT:    v_mov_b32_e32 v0, v38
+; GFX8-NEXT:    v_mov_b32_e32 v2, v39
+; GFX8-NEXT:    v_mov_b32_e32 v8, v48
+; GFX8-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v34
+; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v35
+; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v36
+; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v37
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v25, 16, v52
+; GFX8-NEXT:    v_lshrrev_b32_e32 v27, 16, v53
+; GFX8-NEXT:    v_lshrrev_b32_e32 v29, 16, v54
+; GFX8-NEXT:    v_lshrrev_b32_e32 v31, 16, v55
+; GFX8-NEXT:    v_mov_b32_e32 v10, v49
+; GFX8-NEXT:    v_mov_b32_e32 v12, v50
+; GFX8-NEXT:    v_mov_b32_e32 v14, v51
+; GFX8-NEXT:    v_mov_b32_e32 v16, v34
+; GFX8-NEXT:    v_mov_b32_e32 v18, v35
+; GFX8-NEXT:    v_mov_b32_e32 v20, v36
+; GFX8-NEXT:    v_mov_b32_e32 v22, v37
+; GFX8-NEXT:    v_mov_b32_e32 v24, v52
+; GFX8-NEXT:    v_mov_b32_e32 v26, v53
+; GFX8-NEXT:    v_mov_b32_e32 v28, v54
+; GFX8-NEXT:    v_mov_b32_e32 v30, v55
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v64bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    global_load_dwordx4 v[38:41], v[0:1], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    global_load_dwordx4 v[48:51], v[0:1], off offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[34:37], v[0:1], off offset:32
+; GFX9-NEXT:    global_load_dwordx4 v[52:55], v[0:1], off offset:48
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v40
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v41
+; GFX9-NEXT:    v_mov_b32_e32 v4, v40
+; GFX9-NEXT:    v_mov_b32_e32 v6, v41
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v38
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v39
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v48
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v49
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v50
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v51
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v34
+; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v35
+; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 16, v36
+; GFX9-NEXT:    v_lshrrev_b32_e32 v23, 16, v37
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 16, v52
+; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 16, v53
+; GFX9-NEXT:    v_lshrrev_b32_e32 v29, 16, v54
+; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 16, v55
+; GFX9-NEXT:    v_mov_b32_e32 v0, v38
+; GFX9-NEXT:    v_mov_b32_e32 v2, v39
+; GFX9-NEXT:    v_mov_b32_e32 v8, v48
+; GFX9-NEXT:    v_mov_b32_e32 v10, v49
+; GFX9-NEXT:    v_mov_b32_e32 v12, v50
+; GFX9-NEXT:    v_mov_b32_e32 v14, v51
+; GFX9-NEXT:    v_mov_b32_e32 v16, v34
+; GFX9-NEXT:    v_mov_b32_e32 v18, v35
+; GFX9-NEXT:    v_mov_b32_e32 v20, v36
+; GFX9-NEXT:    v_mov_b32_e32 v22, v37
+; GFX9-NEXT:    v_mov_b32_e32 v24, v52
+; GFX9-NEXT:    v_mov_b32_e32 v26, v53
+; GFX9-NEXT:    v_mov_b32_e32 v28, v54
+; GFX9-NEXT:    v_mov_b32_e32 v30, v55
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v64bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x3
+; GFX10-NEXT:    global_load_dwordx4 v[64:67], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx4 v[48:51], v[0:1], off offset:16
+; GFX10-NEXT:    global_load_dwordx4 v[34:37], v[0:1], off offset:32
+; GFX10-NEXT:    global_load_dwordx4 v[52:55], v[0:1], off offset:48
+; GFX10-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v64
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v65
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v66
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v67
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v48
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v49
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v50
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v51
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v34
+; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v35
+; GFX10-NEXT:    v_lshrrev_b32_e32 v21, 16, v36
+; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v37
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v52
+; GFX10-NEXT:    v_lshrrev_b32_e32 v27, 16, v53
+; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v54
+; GFX10-NEXT:    v_lshrrev_b32_e32 v31, 16, v55
+; GFX10-NEXT:    v_mov_b32_e32 v0, v64
+; GFX10-NEXT:    v_mov_b32_e32 v2, v65
+; GFX10-NEXT:    v_mov_b32_e32 v4, v66
+; GFX10-NEXT:    v_mov_b32_e32 v6, v67
+; GFX10-NEXT:    v_mov_b32_e32 v8, v48
+; GFX10-NEXT:    v_mov_b32_e32 v10, v49
+; GFX10-NEXT:    v_mov_b32_e32 v12, v50
+; GFX10-NEXT:    v_mov_b32_e32 v14, v51
+; GFX10-NEXT:    v_mov_b32_e32 v16, v34
+; GFX10-NEXT:    v_mov_b32_e32 v18, v35
+; GFX10-NEXT:    v_mov_b32_e32 v20, v36
+; GFX10-NEXT:    v_mov_b32_e32 v22, v37
+; GFX10-NEXT:    v_mov_b32_e32 v24, v52
+; GFX10-NEXT:    v_mov_b32_e32 v26, v53
+; GFX10-NEXT:    v_mov_b32_e32 v28, v54
+; GFX10-NEXT:    v_mov_b32_e32 v30, v55
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %load = load <64 x bfloat>, ptr addrspace(1) %ptr
+  ret <64 x bfloat> %load
+}
+
+define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_store_dword v[1:2], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v[1:2], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_store_dword v[1:2], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  store <2 x bfloat> %val, ptr addrspace(1) %ptr
+  ret void
+}
+
+define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v3bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    buffer_store_short v0, v[3:4], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:2
+; GCN-NEXT:    buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v3bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_store_short v0, v[3:4], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:2
+; GFX7-NEXT:    buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v3bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 2, v2
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; GFX8-NEXT:    flat_store_short v[2:3], v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT:    flat_store_short v[4:5], v6
+; GFX8-NEXT:    flat_store_short v[2:3], v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v3bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_store_short v[2:3], v0, off
+; GFX9-NEXT:    global_store_short_d16_hi v[2:3], v0, off offset:2
+; GFX9-NEXT:    global_store_short v[2:3], v1, off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v3bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_store_short v[2:3], v0, off
+; GFX10-NEXT:    global_store_short_d16_hi v[2:3], v0, off offset:2
+; GFX10-NEXT:    global_store_short v[2:3], v1, off offset:4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  store <3 x bfloat> %val, ptr addrspace(1) %ptr
+  ret void
+}
+
+define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT:    v_or_b32_e32 v3, v1, v0
-; GCN-NEXT:    v_or_b32_e32 v2, v4, v2
-; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_or_b32_e32 v1, v3, v2
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX8-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  store <4 x bfloat> %val, ptr addrspace(1) %ptr
+  ret void
+}
+
+define void @v_store_global_v8bf16(<8 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v8bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GCN-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_or_b32_e32 v1, v3, v2
+; GCN-NEXT:    v_or_b32_e32 v2, v5, v4
+; GCN-NEXT:    v_or_b32_e32 v3, v7, v6
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v8bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v8bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX8-NEXT:    v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v8bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v8bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  store <8 x bfloat> %val, ptr addrspace(1) %ptr
+  ret void
+}
+
+define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v16bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GCN-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GCN-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GCN-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GCN-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_or_b32_e32 v1, v3, v2
+; GCN-NEXT:    v_or_b32_e32 v2, v5, v4
+; GCN-NEXT:    v_or_b32_e32 v3, v7, v6
+; GCN-NEXT:    v_or_b32_e32 v4, v9, v8
+; GCN-NEXT:    v_or_b32_e32 v5, v11, v10
+; GCN-NEXT:    v_or_b32_e32 v6, v13, v12
+; GCN-NEXT:    v_or_b32_e32 v7, v15, v14
+; GCN-NEXT:    buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v16bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v8
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v10
+; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v13
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v12
+; GFX7-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v15
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff, v14
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v16bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
+; GFX8-NEXT:    v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
+; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; GFX8-NEXT:    v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v8
+; GFX8-NEXT:    v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v9, vcc
+; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v16bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
+; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v16bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
+; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off offset:16
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  store <16 x bfloat> %val, ptr addrspace(1) %ptr
+  ret void
+}
+
+define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v32bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_or_b32_e32 v1, v3, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v4
+; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
+; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v6
+; GCN-NEXT:    v_or_b32_e32 v3, v3, v4
+; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff, v8
+; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff, v10
+; GCN-NEXT:    v_or_b32_e32 v4, v4, v5
+; GCN-NEXT:    v_or_b32_e32 v5, v6, v7
+; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v13
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff, v12
+; GCN-NEXT:    v_or_b32_e32 v6, v6, v7
+; GCN-NEXT:    v_lshlrev_b32_e32 v7, 16, v15
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff, v14
+; GCN-NEXT:    v_or_b32_e32 v7, v7, v8
+; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v17
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff, v16
+; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v19
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff, v18
+; GCN-NEXT:    v_or_b32_e32 v8, v8, v9
+; GCN-NEXT:    v_or_b32_e32 v9, v10, v11
+; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v21
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff, v20
+; GCN-NEXT:    v_or_b32_e32 v10, v10, v11
+; GCN-NEXT:    v_lshlrev_b32_e32 v11, 16, v23
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff, v22
+; GCN-NEXT:    v_lshlrev_b32_e32 v13, 16, v25
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff, v24
+; GCN-NEXT:    v_lshlrev_b32_e32 v15, 16, v27
+; GCN-NEXT:    v_and_b32_e32 v16, 0xffff, v26
+; GCN-NEXT:    v_lshlrev_b32_e32 v17, 16, v29
+; GCN-NEXT:    v_and_b32_e32 v18, 0xffff, v28
+; GCN-NEXT:    v_and_b32_e32 v19, 0xffff, v30
+; GCN-NEXT:    v_or_b32_e32 v11, v11, v12
+; GCN-NEXT:    v_or_b32_e32 v12, v13, v14
+; GCN-NEXT:    v_or_b32_e32 v13, v15, v16
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s32
+; GCN-NEXT:    v_or_b32_e32 v14, v17, v18
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
+; GCN-NEXT:    v_or_b32_e32 v15, v15, v19
+; GCN-NEXT:    buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:32
+; GCN-NEXT:    buffer_store_dwordx4 v[12:15], v[16:17], s[4:7], 0 addr64 offset:48
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v32bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v10
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT:    v_or_b32_e32 v5, v6, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v13
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v12
+; GFX7-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v15
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff, v14
+; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v17
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff, v16
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v19
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff, v18
+; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT:    v_or_b32_e32 v9, v10, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v21
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff, v20
+; GFX7-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 16, v23
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff, v22
+; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 16, v25
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff, v24
+; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 16, v27
+; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff, v26
+; GFX7-NEXT:    v_or_b32_e32 v11, v11, v12
+; GFX7-NEXT:    v_or_b32_e32 v12, v13, v14
+; GFX7-NEXT:    v_or_b32_e32 v13, v15, v16
+; GFX7-NEXT:    buffer_load_dword v15, off, s[0:3], s32
+; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v29
+; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff, v28
+; GFX7-NEXT:    v_or_b32_e32 v14, v14, v16
+; GFX7-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
+; GFX7-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
+; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff, v30
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX7-NEXT:    v_or_b32_e32 v15, v15, v18
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT:    buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT:    buffer_store_dwordx4 v[12:15], v[16:17], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v32bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v20, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
+; GFX8-NEXT:    v_mov_b32_sdwa v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v1, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v2, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v3, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v22, 16, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v24, 16, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
+; GFX8-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
+; GFX8-NEXT:    v_mov_b32_sdwa v4, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v16
+; GFX8-NEXT:    v_mov_b32_sdwa v5, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v7, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v17, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v26, 16, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v27, 16, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v11
+; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 32, v16
+; GFX8-NEXT:    v_mov_b32_sdwa v8, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v9, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v11, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v17, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v20, 16, v12
+; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v13
+; GFX8-NEXT:    v_lshrrev_b32_e32 v22, 16, v14
+; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v15
+; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 48, v16
+; GFX8-NEXT:    v_mov_b32_sdwa v12, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v13, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v14, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v15, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v17, vcc
+; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[12:15]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v32bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v20, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v23, 16, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v24, 16, v6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 16, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 16, v9
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
+; GFX9-NEXT:    v_mov_b32_sdwa v1, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v11
+; GFX9-NEXT:    v_mov_b32_sdwa v2, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v20, 16, v12
+; GFX9-NEXT:    v_mov_b32_sdwa v3, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 16, v13
+; GFX9-NEXT:    v_mov_b32_sdwa v4, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 16, v14
+; GFX9-NEXT:    v_mov_b32_sdwa v5, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v23, 16, v15
+; GFX9-NEXT:    v_mov_b32_sdwa v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v7, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v8, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v9, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v11, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v12, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v13, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v14, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v15, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    global_store_dwordx4 v[16:17], v[0:3], off
+; GFX9-NEXT:    global_store_dwordx4 v[16:17], v[4:7], off offset:16
+; GFX9-NEXT:    global_store_dwordx4 v[16:17], v[8:11], off offset:32
+; GFX9-NEXT:    global_store_dwordx4 v[16:17], v[12:15], off offset:48
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v32bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v18, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v27, 16, v9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v11
+; GFX10-NEXT:    v_lshrrev_b32_e32 v30, 16, v12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v31, 16, v13
+; GFX10-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
+; GFX10-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v1, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v2, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v3, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v4, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v5, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v7, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v8, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v9, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v10, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v11, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v12, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v13, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v14, v32 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v15, v33 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    global_store_dwordx4 v[16:17], v[0:3], off
+; GFX10-NEXT:    global_store_dwordx4 v[16:17], v[4:7], off offset:16
+; GFX10-NEXT:    global_store_dwordx4 v[16:17], v[8:11], off offset:32
+; GFX10-NEXT:    global_store_dwordx4 v[16:17], v[12:15], off offset:48
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  store <32 x bfloat> %val, ptr addrspace(1) %ptr
+  ret void
+}
+
+define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v64bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_or_b32_e32 v1, v3, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v4
+; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
+; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v6
+; GCN-NEXT:    v_or_b32_e32 v3, v3, v4
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:132
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:136
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v9
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v8
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v11
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v10
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    v_or_b32_e32 v1, v2, v3
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v13
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v12
+; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
+; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v15
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v14
+; GCN-NEXT:    v_or_b32_e32 v3, v3, v6
+; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v17
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff, v16
+; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v19
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff, v18
+; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v21
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff, v20
+; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v23
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff, v22
+; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v25
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff, v24
+; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v27
+; GCN-NEXT:    v_and_b32_e32 v17, 0xffff, v26
+; GCN-NEXT:    buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:16
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:4
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_or_b32_e32 v0, v6, v7
+; GCN-NEXT:    v_or_b32_e32 v1, v8, v9
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GCN-NEXT:    v_or_b32_e32 v2, v10, v11
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:20
+; GCN-NEXT:    v_or_b32_e32 v3, v12, v13
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:24
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:28
+; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v29
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff, v28
+; GCN-NEXT:    buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:32
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:36
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_or_b32_e32 v0, v14, v15
+; GCN-NEXT:    v_or_b32_e32 v1, v16, v17
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:40
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:44
+; GCN-NEXT:    v_or_b32_e32 v2, v12, v13
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:48
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:52
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v30
+; GCN-NEXT:    s_waitcnt vmcnt(14)
+; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
+; GCN-NEXT:    v_or_b32_e32 v3, v16, v3
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:56
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:60
+; GCN-NEXT:    buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:48
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:64
+; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:68
+; GCN-NEXT:    s_waitcnt vmcnt(14) expcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v19
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v8
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v7
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    v_or_b32_e32 v1, v2, v3
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:72
+; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:76
+; GCN-NEXT:    s_waitcnt vmcnt(14)
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v10
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v9
+; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:80
+; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:84
+; GCN-NEXT:    s_waitcnt vmcnt(14)
+; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v20
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v11
+; GCN-NEXT:    s_waitcnt vmcnt(12)
+; GCN-NEXT:    v_lshlrev_b32_e32 v7, 16, v14
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff, v21
+; GCN-NEXT:    s_waitcnt vmcnt(10)
+; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff, v15
+; GCN-NEXT:    v_or_b32_e32 v3, v3, v6
+; GCN-NEXT:    v_or_b32_e32 v6, v7, v8
+; GCN-NEXT:    v_or_b32_e32 v7, v10, v11
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:88
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:92
+; GCN-NEXT:    s_waitcnt vmcnt(10)
+; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v16
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff, v13
+; GCN-NEXT:    v_or_b32_e32 v8, v8, v10
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:96
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:100
+; GCN-NEXT:    s_waitcnt vmcnt(9)
+; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v18
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff, v17
+; GCN-NEXT:    s_waitcnt vmcnt(7)
+; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v19
+; GCN-NEXT:    v_and_b32_e32 v17, 0xffff, v22
+; GCN-NEXT:    s_waitcnt vmcnt(5)
+; GCN-NEXT:    v_lshlrev_b32_e32 v18, 16, v9
+; GCN-NEXT:    v_and_b32_e32 v19, 0xffff, v23
+; GCN-NEXT:    v_or_b32_e32 v9, v10, v11
+; GCN-NEXT:    v_or_b32_e32 v10, v16, v17
+; GCN-NEXT:    v_or_b32_e32 v11, v18, v19
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:104
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:108
+; GCN-NEXT:    s_waitcnt vmcnt(5)
+; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT:    v_and_b32_e32 v18, 0xffff, v24
+; GCN-NEXT:    v_or_b32_e32 v12, v12, v18
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:112
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:120
+; GCN-NEXT:    s_waitcnt vmcnt(5)
+; GCN-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GCN-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GCN-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GCN-NEXT:    v_or_b32_e32 v13, v13, v14
+; GCN-NEXT:    v_or_b32_e32 v14, v16, v15
+; GCN-NEXT:    v_or_b32_e32 v15, v18, v17
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:116
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:128
+; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:124
+; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GCN-NEXT:    v_or_b32_e32 v16, v18, v16
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v18, 0xffff, v19
+; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
+; GCN-NEXT:    buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:64
+; GCN-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[4:7], 0 addr64 offset:80
+; GCN-NEXT:    buffer_store_dwordx4 v[10:13], v[4:5], s[4:7], 0 addr64 offset:96
+; GCN-NEXT:    buffer_store_dwordx4 v[14:17], v[4:5], s[4:7], 0 addr64 offset:112
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v64bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_or_b32_e32 v35, v1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v4
+; GFX7-NEXT:    v_or_b32_e32 v37, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v6
+; GFX7-NEXT:    v_or_b32_e32 v38, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v9
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_or_b32_e32 v31, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v13
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v12
+; GFX7-NEXT:    v_or_b32_e32 v36, v3, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v11
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v10
+; GFX7-NEXT:    v_or_b32_e32 v33, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v15
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v14
+; GFX7-NEXT:    v_or_b32_e32 v32, v2, v3
+; GFX7-NEXT:    buffer_load_dword v3, off, s[0:3], s32
+; GFX7-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:4
+; GFX7-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:8
+; GFX7-NEXT:    v_or_b32_e32 v34, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v17
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v16
+; GFX7-NEXT:    v_or_b32_e32 v4, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v19
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v18
+; GFX7-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:16
+; GFX7-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:12
+; GFX7-NEXT:    v_or_b32_e32 v5, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v21
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v20
+; GFX7-NEXT:    v_or_b32_e32 v6, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v23
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v22
+; GFX7-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:24
+; GFX7-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:20
+; GFX7-NEXT:    v_or_b32_e32 v7, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v25
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v24
+; GFX7-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:132
+; GFX7-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:136
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v27
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v26
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v29
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff, v28
+; GFX7-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:28
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v15
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff, v30
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    s_waitcnt vmcnt(9)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(8)
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX7-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT:    v_or_b32_e32 v8, v9, v8
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v15
+; GFX7-NEXT:    s_waitcnt vmcnt(6)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 16, v14
+; GFX7-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX7-NEXT:    v_or_b32_e32 v9, v9, v13
+; GFX7-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:32
+; GFX7-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:36
+; GFX7-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:40
+; GFX7-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:48
+; GFX7-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:44
+; GFX7-NEXT:    s_waitcnt vmcnt(9)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX7-NEXT:    s_waitcnt vmcnt(8)
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX7-NEXT:    v_or_b32_e32 v10, v12, v10
+; GFX7-NEXT:    s_waitcnt vmcnt(6)
+; GFX7-NEXT:    buffer_store_dwordx4 v[35:38], v[24:25], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:52
+; GFX7-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:56
+; GFX7-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:60
+; GFX7-NEXT:    buffer_store_dwordx4 v[31:34], v[24:25], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:64
+; GFX7-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:68
+; GFX7-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:72
+; GFX7-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:76
+; GFX7-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:80
+; GFX7-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:88
+; GFX7-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:84
+; GFX7-NEXT:    s_waitcnt vmcnt(14)
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
+; GFX7-NEXT:    v_or_b32_e32 v11, v12, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v15
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff, v14
+; GFX7-NEXT:    v_or_b32_e32 v12, v12, v13
+; GFX7-NEXT:    s_waitcnt vmcnt(13)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 16, v16
+; GFX7-NEXT:    s_waitcnt vmcnt(12)
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff, v17
+; GFX7-NEXT:    v_or_b32_e32 v13, v13, v14
+; GFX7-NEXT:    s_waitcnt vmcnt(10)
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff, v18
+; GFX7-NEXT:    s_waitcnt vmcnt(9)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v19
+; GFX7-NEXT:    v_or_b32_e32 v14, v14, v15
+; GFX7-NEXT:    s_waitcnt vmcnt(6)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 16, v21
+; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff, v20
+; GFX7-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:92
+; GFX7-NEXT:    v_or_b32_e32 v15, v15, v16
+; GFX7-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 16, v23
+; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff, v22
+; GFX7-NEXT:    v_or_b32_e32 v16, v16, v17
+; GFX7-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v17, 16, v27
+; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff, v26
+; GFX7-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:96
+; GFX7-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:100
+; GFX7-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:104
+; GFX7-NEXT:    v_or_b32_e32 v17, v17, v18
+; GFX7-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v18, 16, v28
+; GFX7-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff, v29
+; GFX7-NEXT:    v_or_b32_e32 v18, v18, v23
+; GFX7-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:112
+; GFX7-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:108
+; GFX7-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:120
+; GFX7-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:116
+; GFX7-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:124
+; GFX7-NEXT:    s_waitcnt vmcnt(8)
+; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX7-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX7-NEXT:    v_or_b32_e32 v19, v20, v19
+; GFX7-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
+; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX7-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX7-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff, v26
+; GFX7-NEXT:    v_or_b32_e32 v20, v20, v21
+; GFX7-NEXT:    v_or_b32_e32 v21, v22, v23
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v22, 16, v27
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff, v28
+; GFX7-NEXT:    v_or_b32_e32 v22, v22, v23
+; GFX7-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:128
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff, v29
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX7-NEXT:    v_or_b32_e32 v23, v23, v26
+; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], v[24:25], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], v[24:25], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT:    buffer_store_dwordx4 v[8:11], v[24:25], s[4:7], 0 addr64 offset:64
+; GFX7-NEXT:    buffer_store_dwordx4 v[12:15], v[24:25], s[4:7], 0 addr64 offset:80
+; GFX7-NEXT:    buffer_store_dwordx4 v[16:19], v[24:25], s[4:7], 0 addr64 offset:96
+; GFX7-NEXT:    buffer_store_dwordx4 v[20:23], v[24:25], s[4:7], 0 addr64 offset:112
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v64bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v31, 16, v4
+; GFX8-NEXT:    v_mov_b32_sdwa v4, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v31, 16, v5
+; GFX8-NEXT:    v_mov_b32_sdwa v5, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v31, 16, v6
+; GFX8-NEXT:    v_mov_b32_sdwa v6, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v31, 16, v7
+; GFX8-NEXT:    v_mov_b32_sdwa v7, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX8-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_add_u32_e32 v34, vcc, 16, v32
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_addc_u32_e32 v35, vcc, 0, v33, vcc
+; GFX8-NEXT:    flat_store_dwordx4 v[34:35], v[4:7]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v9
+; GFX8-NEXT:    v_mov_b32_sdwa v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v10
+; GFX8-NEXT:    v_mov_b32_sdwa v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v11
+; GFX8-NEXT:    v_mov_b32_sdwa v10, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v12
+; GFX8-NEXT:    v_mov_b32_sdwa v11, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v13
+; GFX8-NEXT:    v_mov_b32_sdwa v12, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v14
+; GFX8-NEXT:    v_mov_b32_sdwa v13, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v15
+; GFX8-NEXT:    v_mov_b32_sdwa v14, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX8-NEXT:    v_mov_b32_sdwa v15, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX8-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_mov_b32_sdwa v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 32, v32
+; GFX8-NEXT:    v_mov_b32_sdwa v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v33, vcc
+; GFX8-NEXT:    flat_store_dwordx4 v[32:33], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 48, v32
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v33, vcc
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[8:11]
+; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[12:15]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v17
+; GFX8-NEXT:    v_mov_b32_sdwa v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v18
+; GFX8-NEXT:    v_mov_b32_sdwa v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v19
+; GFX8-NEXT:    v_mov_b32_sdwa v18, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v20
+; GFX8-NEXT:    v_mov_b32_sdwa v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v21
+; GFX8-NEXT:    v_mov_b32_sdwa v20, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v22
+; GFX8-NEXT:    v_mov_b32_sdwa v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v23
+; GFX8-NEXT:    v_mov_b32_sdwa v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v24
+; GFX8-NEXT:    v_mov_b32_sdwa v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v25
+; GFX8-NEXT:    v_mov_b32_sdwa v24, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v26
+; GFX8-NEXT:    v_mov_b32_sdwa v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v27
+; GFX8-NEXT:    v_mov_b32_sdwa v26, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v28
+; GFX8-NEXT:    v_mov_b32_sdwa v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v29
+; GFX8-NEXT:    v_mov_b32_sdwa v28, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
+; GFX8-NEXT:    v_mov_b32_sdwa v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v31
+; GFX8-NEXT:    v_mov_b32_sdwa v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 64, v32
+; GFX8-NEXT:    v_mov_b32_sdwa v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v33, vcc
+; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[16:19]
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0x50
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v32, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v33, vcc
+; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[20:23]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x60, v32
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v33, vcc
+; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[24:27]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x70, v32
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v33, vcc
+; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[28:31]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v64bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 16, v0
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 16, v1
+; GFX9-NEXT:    v_mov_b32_sdwa v1, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 16, v2
+; GFX9-NEXT:    v_mov_b32_sdwa v2, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 16, v3
+; GFX9-NEXT:    v_mov_b32_sdwa v3, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX9-NEXT:    v_mov_b32_sdwa v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
+; GFX9-NEXT:    v_mov_b32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
+; GFX9-NEXT:    v_mov_b32_sdwa v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
+; GFX9-NEXT:    v_mov_b32_sdwa v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v9
+; GFX9-NEXT:    v_mov_b32_sdwa v8, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v10
+; GFX9-NEXT:    v_mov_b32_sdwa v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v11
+; GFX9-NEXT:    v_mov_b32_sdwa v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v12
+; GFX9-NEXT:    v_mov_b32_sdwa v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v13
+; GFX9-NEXT:    v_mov_b32_sdwa v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v14
+; GFX9-NEXT:    v_mov_b32_sdwa v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v15
+; GFX9-NEXT:    v_mov_b32_sdwa v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
+; GFX9-NEXT:    v_mov_b32_sdwa v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v17
+; GFX9-NEXT:    v_mov_b32_sdwa v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v18
+; GFX9-NEXT:    v_mov_b32_sdwa v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v19
+; GFX9-NEXT:    v_mov_b32_sdwa v18, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v20
+; GFX9-NEXT:    v_mov_b32_sdwa v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v21
+; GFX9-NEXT:    v_mov_b32_sdwa v20, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v22
+; GFX9-NEXT:    v_mov_b32_sdwa v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v23
+; GFX9-NEXT:    v_mov_b32_sdwa v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v24
+; GFX9-NEXT:    v_mov_b32_sdwa v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v25
+; GFX9-NEXT:    v_mov_b32_sdwa v24, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v26
+; GFX9-NEXT:    v_mov_b32_sdwa v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v27
+; GFX9-NEXT:    v_mov_b32_sdwa v26, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v28
+; GFX9-NEXT:    v_mov_b32_sdwa v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v29
+; GFX9-NEXT:    v_mov_b32_sdwa v28, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
+; GFX9-NEXT:    v_mov_b32_sdwa v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v31
+; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[4:7], off offset:16
+; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[8:11], off offset:32
+; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[12:15], off offset:48
+; GFX9-NEXT:    v_mov_b32_sdwa v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[16:19], off offset:64
+; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[20:23], off offset:80
+; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[24:27], off offset:96
+; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[28:31], off offset:112
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v64bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v34, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v38, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v39, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v48, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v49, 16, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v50, 16, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v51, 16, v9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v52, 16, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v53, 16, v11
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v34, 16, v19
+; GFX10-NEXT:    v_lshrrev_b32_e32 v54, 16, v12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v55, 16, v13
+; GFX10-NEXT:    v_lshrrev_b32_e32 v64, 16, v14
+; GFX10-NEXT:    v_lshrrev_b32_e32 v65, 16, v15
+; GFX10-NEXT:    v_lshrrev_b32_e32 v66, 16, v16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v67, 16, v17
+; GFX10-NEXT:    v_lshrrev_b32_e32 v68, 16, v18
+; GFX10-NEXT:    v_mov_b32_sdwa v1, v35 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 16, v20
+; GFX10-NEXT:    v_mov_b32_sdwa v2, v36 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v21
+; GFX10-NEXT:    v_mov_b32_sdwa v3, v37 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v22
+; GFX10-NEXT:    v_mov_b32_sdwa v4, v38 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v38, 16, v23
+; GFX10-NEXT:    v_mov_b32_sdwa v5, v39 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v39, 16, v24
+; GFX10-NEXT:    v_mov_b32_sdwa v6, v48 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v48, 16, v25
+; GFX10-NEXT:    v_mov_b32_sdwa v7, v49 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v49, 16, v26
+; GFX10-NEXT:    v_mov_b32_sdwa v8, v50 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v50, 16, v27
+; GFX10-NEXT:    v_mov_b32_sdwa v9, v51 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v51, 16, v28
+; GFX10-NEXT:    v_mov_b32_sdwa v10, v52 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v52, 16, v29
+; GFX10-NEXT:    v_mov_b32_sdwa v11, v53 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v53, 16, v30
+; GFX10-NEXT:    v_mov_b32_sdwa v19, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v12, v54 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v13, v55 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v14, v64 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v15, v65 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v16, v66 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v17, v67 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v18, v68 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v20, v35 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v21, v36 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v22, v37 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v23, v38 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v24, v39 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v25, v48 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v26, v49 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v27, v50 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v28, v51 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v29, v52 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v30, v53 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[0:3], off
+; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[4:7], off offset:16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v34, 16, v31
+; GFX10-NEXT:    v_mov_b32_sdwa v31, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[8:11], off offset:32
+; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[12:15], off offset:48
+; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[16:19], off offset:64
+; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[20:23], off offset:80
+; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[24:27], off offset:96
+; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[28:31], off offset:112
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  store <64 x bfloat> %val, ptr addrspace(1) %ptr
+  ret void
+}
+
+define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) {
+; GCN-LABEL: test_store_fpimm:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x3f80
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    v_mov_b32_e32 v5, 0x4228
+; GCN-NEXT:    buffer_store_short v4, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_store_short v5, v[2:3], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_store_fpimm:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v4, 0x3f80
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_store_short v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0x4228
+; GFX7-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_store_fpimm:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x3f80
+; GFX8-NEXT:    flat_store_short v[0:1], v4
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0x4228
+; GFX8-NEXT:    flat_store_short v[2:3], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_store_fpimm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3f80
+; GFX9-NEXT:    global_store_short v[0:1], v4, off
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4228
+; GFX9-NEXT:    global_store_short v[2:3], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_store_fpimm:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0x3f80
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0x4228
+; GFX10-NEXT:    global_store_short v[0:1], v4, off
+; GFX10-NEXT:    global_store_short v[2:3], v5, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  store bfloat 1.0, ptr addrspace(1) %ptr0
+  store bfloat 42.0, ptr addrspace(1) %ptr1
+  ret void
+}
+
+; FIXME: unable to translate instruction: fptrunc
+; define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+;   %val = load float, ptr addrspace(1) %in
+;   %val.bf16 = fptrunc float %val to bfloat
+;   store bfloat %val.bf16, ptr addrspace(1) %out
+;   ret void
+; }
+
+; FIXME: unable to translate instruction: fptrunc
+; define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+;   %val = load double, ptr addrspace(1) %in
+;   %val.bf16 = fptrunc double %val to bfloat
+;   store bfloat %val.bf16, ptr addrspace(1) %out
+;   ret void
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+;   %val = load bfloat, ptr addrspace(1) %in
+;   %val.f32 = fpext bfloat %val to float
+;   store float %val.f32, ptr addrspace(1) %out
+;   ret void
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+;   %val = load bfloat, ptr addrspace(1) %in
+;   %val.f64 = fpext bfloat %val to double
+;   store double %val.f64, ptr addrspace(1) %out
+;   ret void
+; }
+
+; Round-trips a <2 x bfloat> through memory: loads it from %in and stores it
+; to %out (both address space 1). Every check prefix expects exactly one dword
+; load followed by one dword store.
+define void @test_load_store_v2bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_load_store_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_load_store_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_load_store_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dword v[2:3], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_load_store_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_load_store_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dword v[2:3], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %val = load <2 x bfloat>, ptr addrspace(1) %in
+  store <2 x bfloat> %val, ptr addrspace(1) %out
+  ret void
+}
+
+; Round-trips a <4 x bfloat> through memory: loads it from %in and stores it
+; to %out (both address space 1). Every check prefix expects one dwordx2 load
+; followed by one dwordx2 store.
+define void @test_load_store_v4bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_load_store_v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_load_store_v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_load_store_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_load_store_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_load_store_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %val = load <4 x bfloat>, ptr addrspace(1) %in
+  store <4 x bfloat> %val, ptr addrspace(1) %out
+  ret void
+}
+
+; Round-trips an <8 x bfloat> through memory: loads it from %in and stores it
+; to %out (both address space 1). Every check prefix expects one dwordx4 load
+; followed by one dwordx4 store.
+define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_load_store_v8bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_load_store_v8bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_load_store_v8bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_load_store_v8bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_load_store_v8bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %val = load <8 x bfloat>, ptr addrspace(1) %in
+  store <8 x bfloat> %val, ptr addrspace(1) %out
+  ret void
+}
+
+; Round-trips a <16 x bfloat> through memory: loads it from %in and stores it
+; to %out (both address space 1). Every check prefix expects two dwordx4 loads
+; and two dwordx4 stores, with the second half at byte offset 16. The GFX8
+; output has no immediate offset on its flat accesses and instead forms the
+; +16 addresses with explicit add / add-with-carry pairs.
+define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_load_store_v16bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_load_store_v16bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_load_store_v16bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_load_store_v16bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_load_store_v16bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %val = load <16 x bfloat>, ptr addrspace(1) %in
+  store <16 x bfloat> %val, ptr addrspace(1) %out
+  ret void
+}
+
+; Stores a scalar bfloat argument to %out (address space 1). Every check
+; prefix expects the value to arrive in v0 and be written with a single
+; 16-bit (short) store, with no conversion instructions in between.
+define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_arg_store:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    buffer_store_short v0, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_arg_store:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_store_short v0, v[1:2], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_arg_store:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_store_short v[1:2], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_arg_store:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_store_short v[1:2], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_arg_store:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_store_short v[1:2], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  store bfloat %in, ptr addrspace(1) %out
+  ret void
+}
+
+; Stores a <2 x bfloat> argument to %out (address space 1). In the GCN and
+; GFX7 outputs the two elements arrive in v0 and v1 and are packed into one
+; dword (shift left by 16, mask with 0xffff, or) before a dword store. The
+; GFX8, GFX9 and GFX10 outputs store v0 directly.
+define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_arg_store_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_arg_store_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_arg_store_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_store_dword v[1:2], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_arg_store_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v[1:2], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_arg_store_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_store_dword v[1:2], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  store <2 x bfloat> %in, ptr addrspace(1) %out
+  ret void
+}
+
+; Stores a <3 x bfloat> argument to %out (address space 1). Every check prefix
+; expects three separate 16-bit stores at byte offsets 0, 2 and 4. In the GCN
+; and GFX7 outputs the elements come from v0, v1 and v2; in the GFX8, GFX9 and
+; GFX10 outputs they come from the low half of v0, the high half of v0, and v1.
+; The pointer parameter uses the opaque `ptr addrspace(1)` spelling, matching
+; the other tests in this file.
+define void @test_arg_store_v3bf16(<3 x bfloat> %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_arg_store_v3bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    buffer_store_short v0, v[3:4], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:2
+; GCN-NEXT:    buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_arg_store_v3bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_store_short v0, v[3:4], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:2
+; GFX7-NEXT:    buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_arg_store_v3bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 2, v2
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; GFX8-NEXT:    flat_store_short v[2:3], v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT:    flat_store_short v[4:5], v6
+; GFX8-NEXT:    flat_store_short v[2:3], v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_arg_store_v3bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_store_short v[2:3], v0, off
+; GFX9-NEXT:    global_store_short_d16_hi v[2:3], v0, off offset:2
+; GFX9-NEXT:    global_store_short v[2:3], v1, off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_arg_store_v3bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_store_short v[2:3], v0, off
+; GFX10-NEXT:    global_store_short_d16_hi v[2:3], v0, off offset:2
+; GFX10-NEXT:    global_store_short v[2:3], v1, off offset:4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  store <3 x bfloat> %in, ptr addrspace(1) %out
+  ret void
+}
+
+; Stores a <4 x bfloat> argument to %out (address space 1). In the GCN and
+; GFX7 outputs the elements arrive one per register in v0-v3 and are packed
+; pairwise (shift left by 16, mask with 0xffff, or) into two dwords before a
+; dwordx2 store. In the GFX8, GFX9 and GFX10 outputs the vector arrives in
+; v0-v1, and each dword goes through a 16-bit right shift plus an SDWA move
+; into its own upper half before the dwordx2 store.
+; NOTE(review): that shift + SDWA move pair appears to rebuild the dword it
+; started from; presumably a missed optimization rather than a requirement -
+; confirm.
+define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_arg_store_v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_or_b32_e32 v1, v3, v2
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_arg_store_v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_arg_store_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX8-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_arg_store_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_arg_store_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  store <4 x bfloat> %in, ptr addrspace(1)  %out
+  ret void
+}
+
+; Stores an <8 x bfloat> argument to %out (address space 1). In the GCN and
+; GFX7 outputs the elements arrive one per register in v0-v7 and are packed
+; pairwise (shift left by 16, mask with 0xffff, or) into four dwords before a
+; dwordx4 store. In the GFX8, GFX9 and GFX10 outputs the vector arrives in
+; v0-v3, and each dword goes through a 16-bit right shift plus an SDWA move
+; into its own upper half before the dwordx4 store.
+define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_arg_store_v8bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GCN-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_or_b32_e32 v1, v3, v2
+; GCN-NEXT:    v_or_b32_e32 v2, v5, v4
+; GCN-NEXT:    v_or_b32_e32 v3, v7, v6
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_arg_store_v8bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_arg_store_v8bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX8-NEXT:    v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_arg_store_v8bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_arg_store_v8bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  store <8 x bfloat> %in, ptr addrspace(1) %out
+  ret void
+}
+
+; Stores a <16 x bfloat> argument to %out (address space 1). In the GCN and
+; GFX7 outputs the elements arrive one per register in v0-v15 and are packed
+; pairwise (shift left by 16, mask with 0xffff, or) into eight dwords. In the
+; GFX8, GFX9 and GFX10 outputs the vector arrives in v0-v7, and each dword goes
+; through a 16-bit right shift plus an SDWA move into its own upper half.
+; Every prefix then expects two dwordx4 stores, the second at byte offset 16;
+; the GFX8 output forms that address with an explicit add / add-with-carry.
+define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_arg_store_v16bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GCN-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GCN-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GCN-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GCN-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_or_b32_e32 v1, v3, v2
+; GCN-NEXT:    v_or_b32_e32 v2, v5, v4
+; GCN-NEXT:    v_or_b32_e32 v3, v7, v6
+; GCN-NEXT:    v_or_b32_e32 v4, v9, v8
+; GCN-NEXT:    v_or_b32_e32 v5, v11, v10
+; GCN-NEXT:    v_or_b32_e32 v6, v13, v12
+; GCN-NEXT:    v_or_b32_e32 v7, v15, v14
+; GCN-NEXT:    buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_arg_store_v16bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v8
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v10
+; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v13
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v12
+; GFX7-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v15
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff, v14
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
+; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_arg_store_v16bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
+; GFX8-NEXT:    v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
+; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; GFX8-NEXT:    v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v8
+; GFX8-NEXT:    v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v9, vcc
+; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_arg_store_v16bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
+; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_arg_store_v16bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
+; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off offset:16
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  store <16 x bfloat> %in, ptr addrspace(1) %out
+  ret void
+}
+
+; Stores a scalar bfloat argument marked `inreg` to %out (address space 1),
+; using the amdgpu_gfx calling convention. Every check prefix expects the
+; value to be copied from s4 into v2 and then written with a single 16-bit
+; (short) store.
+define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_inreg_arg_store:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    s_mov_b32 s38, 0
+; GCN-NEXT:    s_mov_b32 s39, 0xf000
+; GCN-NEXT:    s_mov_b64 s[36:37], 0
+; GCN-NEXT:    buffer_store_short v2, v[0:1], s[36:39], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_inreg_arg_store:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_mov_b32 s38, 0
+; GFX7-NEXT:    s_mov_b32 s39, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[36:37], 0
+; GFX7-NEXT:    buffer_store_short v2, v[0:1], s[36:39], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_inreg_arg_store:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    flat_store_short v[0:1], v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_inreg_arg_store:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_inreg_arg_store:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-NEXT:    global_store_short v[0:1], v2, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  store bfloat %in, ptr addrspace(1) %out
+  ret void
+}
+
+; Takes a `byval(bfloat)` pointer in address space 5 plus a bfloat value,
+; stores the value through the pointer, loads it back and returns it. Every
+; check prefix expects only a 16-bit store of v0 at the s32 offset; no load
+; instruction appears in the expected output.
+define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) {
+; GCN-LABEL: test_byval:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    buffer_store_short v0, off, s[0:3], s32
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_byval:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], s32
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_byval:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_store_short v0, off, s[0:3], s32
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_byval:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], s32
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_byval:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], s32
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  store bfloat %val, ptr addrspace(5) %bv
+  %retval = load bfloat, ptr addrspace(5) %bv
+  ret bfloat %retval
+}
+
+define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) {
+; GCN-LABEL: test_sret:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_sret:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_sret:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_sret:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_sret:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  store bfloat %val, ptr addrspace(5) %sret
+  ret void
+}
+
+define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_bitcast_from_bfloat:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_bitcast_from_bfloat:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_bitcast_from_bfloat:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_short v[2:3], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_bitcast_from_bfloat:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_short v[2:3], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_bitcast_from_bfloat:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_short v[2:3], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %val = load bfloat, ptr addrspace(1) %in
+  %val_int = bitcast bfloat %val to i16
+  store i16 %val_int, ptr addrspace(1) %out
+  ret void
+}
+
+define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: test_bitcast_to_bfloat:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_bitcast_to_bfloat:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_short v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_bitcast_to_bfloat:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_ushort v2, v[2:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_short v[0:1], v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_bitcast_to_bfloat:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v2, v[2:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_short v[0:1], v2, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_bitcast_to_bfloat:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_ushort v2, v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_short v[0:1], v2, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %val = load i16, ptr addrspace(1) %in
+  %val_fp = bitcast i16 %val to bfloat
+  store bfloat %val_fp, ptr addrspace(1) %out
+  ret void
+}
+
+define bfloat @test_ret(bfloat %in) {
+; GCN-LABEL: test_ret:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_ret:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_ret:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_ret:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_ret:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  ret bfloat %in
+}
+
+define <2 x bfloat> @test_ret_v2bf16(<2 x bfloat> %in) {
+; GCN-LABEL: test_ret_v2bf16:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_ret_v2bf16:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_ret_v2bf16:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_ret_v2bf16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_ret_v2bf16:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  ret <2 x bfloat> %in
+}
+
+define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) {
+; GCN-LABEL: test_ret_v3bf16:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_ret_v3bf16:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_ret_v3bf16:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_ret_v3bf16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_ret_v3bf16:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  ret <3 x bfloat> %in
+}
+
+define <4 x bfloat> @test_ret_v4bf16(<4 x bfloat> %in) {
+; GCN-LABEL: test_ret_v4bf16:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_ret_v4bf16:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_ret_v4bf16:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_ret_v4bf16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_ret_v4bf16:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  ret <4 x bfloat> %in
+}
+
+define <8 x bfloat> @test_ret_v8bf16(<8 x bfloat> %in) {
+; GCN-LABEL: test_ret_v8bf16:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_ret_v8bf16:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_ret_v8bf16:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_ret_v8bf16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_ret_v8bf16:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  ret <8 x bfloat> %in
+}
+
+define <16 x bfloat> @test_ret_v16bf16(<16 x bfloat> %in) {
+; GCN-LABEL: test_ret_v16bf16:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_ret_v16bf16:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_ret_v16bf16:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v4, v1
+; GFX8-NEXT:    v_mov_b32_e32 v8, v2
+; GFX8-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; GFX8-NEXT:    v_mov_b32_e32 v2, v4
+; GFX8-NEXT:    v_mov_b32_e32 v4, v8
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_ret_v16bf16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v8, v2
+; GFX9-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; GFX9-NEXT:    v_mov_b32_e32 v2, v4
+; GFX9-NEXT:    v_mov_b32_e32 v4, v8
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_ret_v16bf16:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v4, v2
+; GFX10-NEXT:    v_mov_b32_e32 v6, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; GFX10-NEXT:    v_mov_b32_e32 v1, v8
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  ret <16 x bfloat> %in
+}
+
+define void @test_call(bfloat %in, ptr addrspace(5) %out) {
+; GCN-LABEL: test_call:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s8, s33
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_writelane_b32 v2, s30, 0
+; GCN-NEXT:    v_writelane_b32 v2, s31, 1
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_readlane_b32 s31, v2, 1
+; GCN-NEXT:    v_readlane_b32 s30, v2, 0
+; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_addk_i32 s32, 0xfc00
+; GCN-NEXT:    s_mov_b32 s33, s8
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_call:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s8, s33
+; GFX7-NEXT:    s_mov_b32 s33, s32
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_addk_i32 s32, 0x400
+; GFX7-NEXT:    s_getpc_b64 s[4:5]
+; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4
+; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX7-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX7-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX7-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX7-NEXT:    s_mov_b32 s33, s8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_call:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s6, s33
+; GFX8-NEXT:    s_mov_b32 s33, s32
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_addk_i32 s32, 0x400
+; GFX8-NEXT:    s_getpc_b64 s[4:5]
+; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4
+; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX8-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX8-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX8-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX8-NEXT:    s_mov_b32 s33, s6
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_call:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_call:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s6, s33
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_mov_b32 s33, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %result = call bfloat @test_arg_store(bfloat %in)
+  store volatile bfloat %result, ptr addrspace(5) %out
+  ret void
+}
+
+define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
+; GCN-LABEL: test_call_v2bf16:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s8, s33
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_writelane_b32 v3, s30, 0
+; GCN-NEXT:    v_writelane_b32 v3, s31, 1
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_readlane_b32 s31, v3, 1
+; GCN-NEXT:    v_readlane_b32 s30, v3, 0
+; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_addk_i32 s32, 0xfc00
+; GCN-NEXT:    s_mov_b32 s33, s8
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_call_v2bf16:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s8, s33
+; GFX7-NEXT:    s_mov_b32 s33, s32
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_addk_i32 s32, 0x400
+; GFX7-NEXT:    s_getpc_b64 s[4:5]
+; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX7-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX7-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readlane_b32 s31, v3, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX7-NEXT:    s_mov_b32 s33, s8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_call_v2bf16:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s6, s33
+; GFX8-NEXT:    s_mov_b32 s33, s32
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_addk_i32 s32, 0x400
+; GFX8-NEXT:    s_getpc_b64 s[4:5]
+; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX8-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX8-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX8-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX8-NEXT:    s_mov_b32 s33, s6
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_call_v2bf16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_call_v2bf16:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s6, s33
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_mov_b32 s33, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %result = call <2 x bfloat> @test_arg_store_v2bf16(<2 x bfloat> %in)
+  store volatile <2 x bfloat> %result, ptr addrspace(5) %out
+  ret void
+}
+
+define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
+; GCN-LABEL: test_call_v3bf16:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s8, s33
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_writelane_b32 v4, s30, 0
+; GCN-NEXT:    v_writelane_b32 v4, s31, 1
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, 4, v3
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, v3, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v2, v5, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_readlane_b32 s31, v4, 1
+; GCN-NEXT:    v_readlane_b32 s30, v4, 0
+; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_addk_i32 s32, 0xfc00
+; GCN-NEXT:    s_mov_b32 s33, s8
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_call_v3bf16:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s8, s33
+; GFX7-NEXT:    s_mov_b32 s33, s32
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_addk_i32 s32, 0x400
+; GFX7-NEXT:    s_getpc_b64 s[4:5]
+; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX7-NEXT:    v_writelane_b32 v4, s30, 0
+; GFX7-NEXT:    v_writelane_b32 v4, s31, 1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, v3, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v3
+; GFX7-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readlane_b32 s31, v4, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v4, 0
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX7-NEXT:    s_mov_b32 s33, s8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_call_v3bf16:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s6, s33
+; GFX8-NEXT:    s_mov_b32 s33, s32
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_addk_i32 s32, 0x400
+; GFX8-NEXT:    s_getpc_b64 s[4:5]
+; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v2
+; GFX8-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readlane_b32 s31, v3, 1
+; GFX8-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX8-NEXT:    s_mov_b32 s33, s6
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_call_v3bf16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v0
+; GFX9-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readlane_b32 s31, v3, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_call_v3bf16:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s6, s33
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
+; GFX10-NEXT:    v_readlane_b32 s31, v3, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX10-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_mov_b32 s33, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %result = call <3 x bfloat> @test_arg_store_v2bf16(<3 x bfloat> %in)
+  store volatile <3 x bfloat> %result, ptr addrspace(5) %out
+  ret void
+}
+
+define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
+; GCN-LABEL: test_call_v4bf16:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s8, s33
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_writelane_b32 v5, s30, 0
+; GCN-NEXT:    v_writelane_b32 v5, s31, 1
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, 4, v4
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_or_b32_e32 v1, v3, v2
+; GCN-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v1, v6, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_readlane_b32 s31, v5, 1
+; GCN-NEXT:    v_readlane_b32 s30, v5, 0
+; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_addk_i32 s32, 0xfc00
+; GCN-NEXT:    s_mov_b32 s33, s8
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_call_v4bf16:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s8, s33
+; GFX7-NEXT:    s_mov_b32 s33, s32
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_addk_i32 s32, 0x400
+; GFX7-NEXT:    s_getpc_b64 s[4:5]
+; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX7-NEXT:    v_writelane_b32 v5, s30, 0
+; GFX7-NEXT:    v_writelane_b32 v5, s31, 1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v4
+; GFX7-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readlane_b32 s31, v5, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v5, 0
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX7-NEXT:    s_mov_b32 s33, s8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_call_v4bf16:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s6, s33
+; GFX8-NEXT:    s_mov_b32 s33, s32
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_addk_i32 s32, 0x400
+; GFX8-NEXT:    s_getpc_b64 s[4:5]
+; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX8-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v2
+; GFX8-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readlane_b32 s31, v3, 1
+; GFX8-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX8-NEXT:    s_mov_b32 s33, s6
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_call_v4bf16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readlane_b32 s31, v3, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_call_v4bf16:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s6, s33
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX10-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_readlane_b32 s31, v3, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_mov_b32 s33, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %result = call <4 x bfloat> @test_arg_store_v2bf16(<4 x bfloat> %in)
+  store volatile <4 x bfloat> %result, ptr addrspace(5) %out
+  ret void
+}
+
+define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
+; GCN-LABEL: test_call_v8bf16:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s8, s33
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_writelane_b32 v9, s30, 0
+; GCN-NEXT:    v_writelane_b32 v9, s31, 1
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GCN-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT:    v_add_i32_e32 v10, vcc, 4, v8
+; GCN-NEXT:    v_add_i32_e32 v11, vcc, 8, v8
+; GCN-NEXT:    v_add_i32_e32 v12, vcc, 12, v8
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_or_b32_e32 v1, v3, v2
+; GCN-NEXT:    v_or_b32_e32 v2, v5, v4
+; GCN-NEXT:    v_or_b32_e32 v3, v7, v6
+; GCN-NEXT:    buffer_store_dword v0, v8, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v1, v10, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v2, v11, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v3, v12, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_readlane_b32 s31, v9, 1
+; GCN-NEXT:    v_readlane_b32 s30, v9, 0
+; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_addk_i32 s32, 0xfc00
+; GCN-NEXT:    s_mov_b32 s33, s8
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_call_v8bf16:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s8, s33
+; GFX7-NEXT:    s_mov_b32 s33, s32
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_addk_i32 s32, 0x400
+; GFX7-NEXT:    s_getpc_b64 s[4:5]
+; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX7-NEXT:    v_writelane_b32 v9, s30, 0
+; GFX7-NEXT:    v_writelane_b32 v9, s31, 1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v4
+; GFX7-NEXT:    buffer_store_dword v0, v8, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v8
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 8, v8
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 12, v8
+; GFX7-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readlane_b32 s31, v9, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v9, 0
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX7-NEXT:    s_mov_b32 s33, s8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_call_v8bf16:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s6, s33
+; GFX8-NEXT:    s_mov_b32 s33, s32
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_addk_i32 s32, 0x400
+; GFX8-NEXT:    s_getpc_b64 s[4:5]
+; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT:    v_mov_b32_e32 v2, v1
+; GFX8-NEXT:    v_writelane_b32 v5, s30, 0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_writelane_b32 v5, s31, 1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX8-NEXT:    v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX8-NEXT:    v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX8-NEXT:    v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v4
+; GFX8-NEXT:    v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 12, v4
+; GFX8-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readlane_b32 s31, v5, 1
+; GFX8-NEXT:    v_readlane_b32 s30, v5, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX8-NEXT:    s_mov_b32 s33, s6
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_call_v8bf16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_writelane_b32 v5, s30, 0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    v_writelane_b32 v5, s31, 1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readlane_b32 s31, v5, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v5, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_call_v8bf16:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s6, s33
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX10-NEXT:    v_mov_b32_e32 v2, v1
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_writelane_b32 v5, s30, 0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    v_writelane_b32 v5, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX10-NEXT:    v_readlane_b32 s31, v5, 1
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_readlane_b32 s30, v5, 0
+; GFX10-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_mov_b32 s33, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %result = call <8 x bfloat> @test_arg_store_v2bf16(<8 x bfloat> %in)
+  store volatile <8 x bfloat> %result, ptr addrspace(5) %out
+  ret void
+}
+
+define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
+; GCN-LABEL: test_call_v16bf16:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s8, s33
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_writelane_b32 v17, s30, 0
+; GCN-NEXT:    v_writelane_b32 v17, s31, 1
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GCN-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GCN-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GCN-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GCN-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GCN-NEXT:    v_add_i32_e32 v18, vcc, 4, v16
+; GCN-NEXT:    v_add_i32_e32 v19, vcc, 8, v16
+; GCN-NEXT:    v_add_i32_e32 v20, vcc, 12, v16
+; GCN-NEXT:    v_add_i32_e32 v21, vcc, 16, v16
+; GCN-NEXT:    v_add_i32_e32 v22, vcc, 20, v16
+; GCN-NEXT:    v_add_i32_e32 v23, vcc, 24, v16
+; GCN-NEXT:    v_add_i32_e32 v24, vcc, 28, v16
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_or_b32_e32 v1, v3, v2
+; GCN-NEXT:    v_or_b32_e32 v2, v5, v4
+; GCN-NEXT:    v_or_b32_e32 v3, v7, v6
+; GCN-NEXT:    v_or_b32_e32 v4, v9, v8
+; GCN-NEXT:    v_or_b32_e32 v5, v11, v10
+; GCN-NEXT:    v_or_b32_e32 v6, v13, v12
+; GCN-NEXT:    v_or_b32_e32 v7, v15, v14
+; GCN-NEXT:    buffer_store_dword v0, v16, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v1, v18, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v2, v19, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v3, v20, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v4, v21, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v5, v22, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v6, v23, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v7, v24, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_readlane_b32 s31, v17, 1
+; GCN-NEXT:    v_readlane_b32 s30, v17, 0
+; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_addk_i32 s32, 0xfc00
+; GCN-NEXT:    s_mov_b32 s33, s8
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_call_v16bf16:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s8, s33
+; GFX7-NEXT:    s_mov_b32 s33, s32
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_addk_i32 s32, 0x400
+; GFX7-NEXT:    s_getpc_b64 s[4:5]
+; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX7-NEXT:    v_writelane_b32 v17, s30, 0
+; GFX7-NEXT:    v_writelane_b32 v17, s31, 1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v4
+; GFX7-NEXT:    buffer_store_dword v0, v16, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v16
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 8, v16
+; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v8
+; GFX7-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 12, v16
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v10
+; GFX7-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 16, v16
+; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v13
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v12
+; GFX7-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 20, v16
+; GFX7-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v15
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff, v14
+; GFX7-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 24, v16
+; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
+; GFX7-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 28, v16
+; GFX7-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_readlane_b32 s31, v17, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v17, 0
+; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT:    buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX7-NEXT:    s_mov_b32 s33, s8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_call_v16bf16:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s6, s33
+; GFX8-NEXT:    s_mov_b32 s33, s32
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_addk_i32 s32, 0x400
+; GFX8-NEXT:    s_getpc_b64 s[4:5]
+; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT:    v_mov_b32_e32 v4, v1
+; GFX8-NEXT:    v_mov_b32_e32 v10, v2
+; GFX8-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8-NEXT:    v_writelane_b32 v9, s30, 0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; GFX8-NEXT:    v_mov_b32_e32 v2, v4
+; GFX8-NEXT:    v_mov_b32_e32 v4, v10
+; GFX8-NEXT:    v_writelane_b32 v9, s31, 1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
+; GFX8-NEXT:    v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
+; GFX8-NEXT:    v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    buffer_store_dword v0, v8, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
+; GFX8-NEXT:    v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
+; GFX8-NEXT:    v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 12, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v5
+; GFX8-NEXT:    v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
+; GFX8-NEXT:    v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 20, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
+; GFX8-NEXT:    v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 24, v8
+; GFX8-NEXT:    v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 28, v8
+; GFX8-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readlane_b32 s31, v9, 1
+; GFX8-NEXT:    v_readlane_b32 s30, v9, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX8-NEXT:    s_mov_b32 s33, s6
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_call_v16bf16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v10, v2
+; GFX9-NEXT:    v_mov_b32_e32 v6, v3
+; GFX9-NEXT:    v_writelane_b32 v9, s30, 0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; GFX9-NEXT:    v_mov_b32_e32 v2, v4
+; GFX9-NEXT:    v_mov_b32_e32 v4, v10
+; GFX9-NEXT:    v_writelane_b32 v9, s31, 1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    buffer_store_dword v0, v8, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v1, v8, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v2, v8, s[0:3], 0 offen offset:8
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v3, v8, s[0:3], 0 offen offset:12
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v4, v8, s[0:3], 0 offen offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen offset:20
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_readlane_b32 s31, v9, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v9, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_call_v16bf16:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s6, s33
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX10-NEXT:    v_mov_b32_e32 v4, v1
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v10, v2
+; GFX10-NEXT:    v_mov_b32_e32 v6, v3
+; GFX10-NEXT:    v_writelane_b32 v9, s30, 0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; GFX10-NEXT:    v_mov_b32_e32 v2, v4
+; GFX10-NEXT:    v_mov_b32_e32 v4, v10
+; GFX10-NEXT:    v_writelane_b32 v9, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    buffer_store_dword v0, v8, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_dword v1, v8, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_dword v2, v8, s[0:3], 0 offen offset:8
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_dword v3, v8, s[0:3], 0 offen offset:12
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_dword v4, v8, s[0:3], 0 offen offset:16
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen offset:20
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_readlane_b32 s31, v9, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v9, 0
+; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s4
+; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_mov_b32 s33, s6
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %result = call <16 x bfloat> @test_arg_store_v2bf16(<16 x bfloat> %in)
+  store volatile <16 x bfloat> %result, ptr addrspace(5) %out
+  ret void
+}
+
+define bfloat @test_alloca_load_store_ret(bfloat %in) { ; Round-trips a bfloat argument through a stack slot and returns the reloaded value.
+; GCN-LABEL: test_alloca_load_store_ret:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    buffer_store_short v0, off, s[0:3], s32
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_alloca_load_store_ret:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], s32
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_alloca_load_store_ret:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_store_short v0, off, s[0:3], s32
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_alloca_load_store_ret:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], s32
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_alloca_load_store_ret:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], s32
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %in.addr = alloca bfloat, align 2, addrspace(5) ; 2-byte-aligned bfloat slot in addrspace(5)
+  store volatile bfloat %in, ptr addrspace(5) %in.addr, align 2 ; volatile so the store is kept; the checks expect a 16-bit buffer_store_short at s32
+  %loaded = load volatile bfloat, ptr addrspace(5) %in.addr, align 2 ; volatile reload of the same slot; the checks expect buffer_load_ushort into v0
+  ret bfloat %loaded ; returned value is the reloaded bfloat, left in v0 per the checks
+}
+
+define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; Returns the aggregate { %b, %a }; the checks show part of %b arriving on the stack and the result being written to memory.
+; GCN-LABEL: test_overflow_stack:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
+; GCN-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
+; GCN-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 12, v0
+; GCN-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 16, v0
+; GCN-NEXT:    buffer_store_dword v6, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 20, v0
+; GCN-NEXT:    buffer_store_dword v7, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 24, v0
+; GCN-NEXT:    buffer_store_dword v8, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 28, v0
+; GCN-NEXT:    buffer_store_dword v9, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 32, v0
+; GCN-NEXT:    buffer_store_dword v10, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 36, v0
+; GCN-NEXT:    buffer_store_dword v11, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 40, v0
+; GCN-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 44, v0
+; GCN-NEXT:    buffer_store_dword v13, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s32
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
+; GCN-NEXT:    buffer_store_dword v14, v3, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:4
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, 52, v0
+; GCN-NEXT:    buffer_store_dword v15, v4, s[0:3], 0 offen
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, 56, v0
+; GCN-NEXT:    buffer_store_dword v16, v5, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, 60, v0
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, 64, v0
+; GCN-NEXT:    buffer_store_dword v17, v5, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v5, 0x44
+; GCN-NEXT:    v_mov_b32_e32 v7, 0x48
+; GCN-NEXT:    buffer_store_dword v18, v6, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v6, 0x4c
+; GCN-NEXT:    v_mov_b32_e32 v8, 0x50
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
+; GCN-NEXT:    buffer_store_dword v19, v5, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v5, 0x54
+; GCN-NEXT:    v_mov_b32_e32 v9, 0x58
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, v0, v7
+; GCN-NEXT:    buffer_store_dword v20, v7, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v7, 0x5c
+; GCN-NEXT:    v_mov_b32_e32 v10, 0x60
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v0, v6
+; GCN-NEXT:    buffer_store_dword v21, v6, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v6, 0x64
+; GCN-NEXT:    v_mov_b32_e32 v11, 0x68
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, v0, v8
+; GCN-NEXT:    buffer_store_dword v22, v8, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v8, 0x6c
+; GCN-NEXT:    v_add_i32_e32 v12, vcc, 0x70, v0
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
+; GCN-NEXT:    buffer_store_dword v23, v5, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, 0x74, v0
+; GCN-NEXT:    v_add_i32_e32 v13, vcc, 0x78, v0
+; GCN-NEXT:    v_add_i32_e32 v9, vcc, v0, v9
+; GCN-NEXT:    buffer_store_dword v24, v9, s[0:3], 0 offen
+; GCN-NEXT:    v_add_i32_e32 v9, vcc, 0x7c, v0
+; GCN-NEXT:    v_add_i32_e32 v14, vcc, 0x80, v0
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, v0, v7
+; GCN-NEXT:    v_add_i32_e32 v10, vcc, v0, v10
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v0, v6
+; GCN-NEXT:    v_add_i32_e32 v11, vcc, v0, v11
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
+; GCN-NEXT:    buffer_store_dword v25, v7, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v26, v10, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v27, v6, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v28, v11, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v30, v12, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(14)
+; GCN-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v3, v13, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v4, v9, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_short v1, v14, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_overflow_stack:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
+; GFX7-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
+; GFX7-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 12, v0
+; GFX7-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 16, v0
+; GFX7-NEXT:    buffer_store_dword v6, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 20, v0
+; GFX7-NEXT:    buffer_store_dword v7, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 24, v0
+; GFX7-NEXT:    buffer_store_dword v8, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 28, v0
+; GFX7-NEXT:    buffer_store_dword v9, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 32, v0
+; GFX7-NEXT:    buffer_store_dword v10, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 36, v0
+; GFX7-NEXT:    buffer_store_dword v11, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 40, v0
+; GFX7-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 44, v0
+; GFX7-NEXT:    buffer_store_dword v13, v2, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s32
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
+; GFX7-NEXT:    buffer_store_dword v14, v3, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:4
+; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 52, v0
+; GFX7-NEXT:    buffer_store_dword v15, v4, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 56, v0
+; GFX7-NEXT:    buffer_store_dword v16, v5, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 60, v0
+; GFX7-NEXT:    buffer_store_dword v17, v5, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 64, v0
+; GFX7-NEXT:    buffer_store_dword v18, v5, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0x44
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
+; GFX7-NEXT:    buffer_store_dword v19, v5, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0x48
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
+; GFX7-NEXT:    buffer_store_dword v20, v5, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0x4c
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
+; GFX7-NEXT:    buffer_store_dword v21, v5, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0x50
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
+; GFX7-NEXT:    buffer_store_dword v22, v5, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0x54
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
+; GFX7-NEXT:    buffer_store_dword v23, v5, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0x58
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
+; GFX7-NEXT:    buffer_store_dword v24, v5, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0x5c
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
+; GFX7-NEXT:    buffer_store_dword v25, v5, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0x60
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
+; GFX7-NEXT:    buffer_store_dword v26, v5, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0x64
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
+; GFX7-NEXT:    buffer_store_dword v27, v5, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0x68
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
+; GFX7-NEXT:    buffer_store_dword v28, v5, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0x6c
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
+; GFX7-NEXT:    buffer_store_dword v29, v5, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 0x70, v0
+; GFX7-NEXT:    buffer_store_dword v30, v5, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 0x74, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(14)
+; GFX7-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
+; GFX7-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x7c, v0
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x80, v0
+; GFX7-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_overflow_stack:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
+; GFX8-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
+; GFX8-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 12, v0
+; GFX8-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 16, v0
+; GFX8-NEXT:    buffer_store_dword v6, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 20, v0
+; GFX8-NEXT:    buffer_store_dword v7, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 24, v0
+; GFX8-NEXT:    buffer_store_dword v8, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 28, v0
+; GFX8-NEXT:    buffer_store_dword v9, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 32, v0
+; GFX8-NEXT:    buffer_store_dword v10, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 36, v0
+; GFX8-NEXT:    buffer_store_dword v11, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 40, v0
+; GFX8-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 44, v0
+; GFX8-NEXT:    buffer_store_dword v13, v2, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s32
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 48, v0
+; GFX8-NEXT:    buffer_store_dword v14, v3, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:4
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 52, v0
+; GFX8-NEXT:    buffer_store_dword v15, v4, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 56, v0
+; GFX8-NEXT:    buffer_store_dword v16, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 60, v0
+; GFX8-NEXT:    buffer_store_dword v17, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 64, v0
+; GFX8-NEXT:    buffer_store_dword v18, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x44
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v0, v5
+; GFX8-NEXT:    buffer_store_dword v19, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x48
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v0, v5
+; GFX8-NEXT:    buffer_store_dword v20, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x4c
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v0, v5
+; GFX8-NEXT:    buffer_store_dword v21, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x50
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v0, v5
+; GFX8-NEXT:    buffer_store_dword v22, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x54
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v0, v5
+; GFX8-NEXT:    buffer_store_dword v23, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x58
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v0, v5
+; GFX8-NEXT:    buffer_store_dword v24, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x5c
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v0, v5
+; GFX8-NEXT:    buffer_store_dword v25, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x60
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v0, v5
+; GFX8-NEXT:    buffer_store_dword v26, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x64
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v0, v5
+; GFX8-NEXT:    buffer_store_dword v27, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x68
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v0, v5
+; GFX8-NEXT:    buffer_store_dword v28, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x6c
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v0, v5
+; GFX8-NEXT:    buffer_store_dword v29, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x70, v0
+; GFX8-NEXT:    buffer_store_dword v30, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x74, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(14)
+; GFX8-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x78, v0
+; GFX8-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7c, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x80, v0
+; GFX8-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_overflow_stack:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX9-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; GFX9-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; GFX9-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; GFX9-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; GFX9-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
+; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
+; GFX9-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:32
+; GFX9-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:36
+; GFX9-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:40
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:44
+; GFX9-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:48
+; GFX9-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
+; GFX9-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
+; GFX9-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
+; GFX9-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
+; GFX9-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
+; GFX9-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
+; GFX9-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
+; GFX9-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
+; GFX9-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
+; GFX9-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
+; GFX9-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
+; GFX9-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
+; GFX9-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
+; GFX9-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:108
+; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:112
+; GFX9-NEXT:    s_waitcnt vmcnt(20)
+; GFX9-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:116
+; GFX9-NEXT:    s_waitcnt vmcnt(20)
+; GFX9-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:120
+; GFX9-NEXT:    s_waitcnt vmcnt(20)
+; GFX9-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:124
+; GFX9-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen offset:128
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_overflow_stack:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX10-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX10-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; GFX10-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; GFX10-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; GFX10-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; GFX10-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
+; GFX10-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
+; GFX10-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:32
+; GFX10-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:36
+; GFX10-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:40
+; GFX10-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:44
+; GFX10-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:48
+; GFX10-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
+; GFX10-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
+; GFX10-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
+; GFX10-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
+; GFX10-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
+; GFX10-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
+; GFX10-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
+; GFX10-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
+; GFX10-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
+; GFX10-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
+; GFX10-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
+; GFX10-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
+; GFX10-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
+; GFX10-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
+; GFX10-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:108
+; GFX10-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:112
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:116
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:120
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen offset:124
+; GFX10-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen offset:128
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0 ; field 0 = %b; in the checks 29 dwords come from v2-v30 and the last 3 are reloaded from the stack at s32 (offsets 0, 4, 8)
+  %ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1 ; field 1 = %a; the checks store it from v1 as a 16-bit value at byte offset 128 (0x80)
+  ret { <32 x i32>, bfloat } %ins.1 ; every check prefix writes the result through the address in v0 - presumably an implicit return-slot pointer; NOTE(review): confirm against the calling convention
+}
+
+; FIXME: unable to translate instruction: fpext
+; define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) {
+;   %load = load <2 x bfloat>, ptr addrspace(1) %ptr
+;   %fpext = fpext <2 x bfloat> %load to <2 x float>
+;   ret <2 x float> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) {
+;   %load = load <3 x bfloat>, ptr addrspace(1) %ptr
+;   %fpext = fpext <3 x bfloat> %load to <3 x float>
+;   ret <3 x float> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <4 x float> @global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) {
+;   %load = load <4 x bfloat>, ptr addrspace(1) %ptr
+;   %fpext = fpext <4 x bfloat> %load to <4 x float>
+;   ret <4 x float> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) {
+;   %load = load <5 x bfloat>, ptr addrspace(1) %ptr
+;   %fpext = fpext <5 x bfloat> %load to <5 x float>
+;   ret <5 x float> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) {
+;   %load = load <6 x bfloat>, ptr addrspace(1) %ptr
+;   %fpext = fpext <6 x bfloat> %load to <6 x float>
+;   ret <6 x float> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) {
+;   %load = load <8 x bfloat>, ptr addrspace(1) %ptr
+;   %fpext = fpext <8 x bfloat> %load to <8 x float>
+;   ret <8 x float> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) {
+;   %load = load <16 x bfloat>, ptr addrspace(1) %ptr
+;   %fpext = fpext <16 x bfloat> %load to <16 x float>
+;   ret <16 x float> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) {
+;   %load = load <32 x bfloat>, ptr addrspace(1) %ptr
+;   %fpext = fpext <32 x bfloat> %load to <32 x float>
+;   ret <32 x float> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) {
+;   %load = load <2 x bfloat>, ptr addrspace(1) %ptr
+;   %fpext = fpext <2 x bfloat> %load to <2 x double>
+;   ret <2 x double> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) {
+;   %load = load <3 x bfloat>, ptr addrspace(1) %ptr
+;   %fpext = fpext <3 x bfloat> %load to <3 x double>
+;   ret <3 x double> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) {
+;   %load = load <4 x bfloat>, ptr addrspace(1) %ptr
+;   %fpext = fpext <4 x bfloat> %load to <4 x double>
+;   ret <4 x double> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) {
+;   %load = load <5 x bfloat>, ptr addrspace(1) %ptr
+;   %fpext = fpext <5 x bfloat> %load to <5 x double>
+;   ret <5 x double> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) {
+;   %load = load <6 x bfloat>, ptr addrspace(1) %ptr
+;   %fpext = fpext <6 x bfloat> %load to <6 x double>
+;   ret <6 x double> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) {
+;   %load = load <8 x bfloat>, ptr addrspace(1) %ptr
+;   %fpext = fpext <8 x bfloat> %load to <8 x double>
+;   ret <8 x double> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) {
+;   %load = load <16 x bfloat>, ptr addrspace(1) %ptr
+;   %fpext = fpext <16 x bfloat> %load to <16 x double>
+;   ret <16 x double> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
+;   %load = load <32 x bfloat>, ptr addrspace(1) %ptr
+;   %fpext = fpext <32 x bfloat> %load to <32 x double>
+;   ret <32 x double> %fpext
+; }
+
+; Scalar bfloat fadd of the two arguments.
+; NOTE(review): the GCN and GFX7 checks extend with v_cvt_f32_f16, add in f32
+; and convert back with v_cvt_f16_f32, and the GFX8, GFX9 and GFX10 checks use
+; v_add_f16_e32, so the expected output applies half-precision instructions to
+; bfloat operands. Presumably this records the current GlobalISel output
+; rather than correct bf16 semantics - confirm.
+define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fadd_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fadd bfloat %a, %b
+  ret bfloat %op
+}
+
+; fadd of two <2 x bfloat> arguments.
+; The GCN and GFX7 checks add v0 to v2 and v1 to v3 in f32. The GFX8 checks
+; use one plain and one SDWA (WORD_1) v_add_f16 and OR the two results
+; together. The GFX9 and GFX10 checks use a single v_pk_add_f16.
+; NOTE(review): all of these are f16 instructions applied to bfloat data -
+; confirm the expectations are intentional.
+define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_fadd_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v2
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v3
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_f16_e32 v2, v0, v1
+; GFX8-NEXT:    v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_add_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_add_f16 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %op
+}
+
+; fadd of two <3 x bfloat> arguments.
+; The GCN and GFX7 checks perform three f32 adds, one per element.
+; NOTE(review): the GFX8 checks contain only two v_add_f16 and the GFX9 and
+; GFX10 checks a single v_pk_add_f16, i.e. two element adds for a 3-element
+; vector, and every prefix uses f16 instructions on bfloat data. Confirm this
+; generated output is what the test is meant to pin down.
+define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+; GCN-LABEL: v_fadd_v3bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v3
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v4
+; GCN-NEXT:    v_add_f32_e32 v2, v2, v5
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_v3bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_v3bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_f16_e32 v3, v0, v2
+; GFX8-NEXT:    v_add_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_v3bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v0
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v2, v2
+; GFX9-NEXT:    v_pk_add_f16 v0, v0, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_v3bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
+; GFX10-NEXT:    v_bfi_b32 v1, 0xffff, v2, v2
+; GFX10-NEXT:    v_pk_add_f16 v0, v0, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fadd <3 x bfloat> %a, %b
+  ret <3 x bfloat> %op
+}
+
+; fadd of two <4 x bfloat> arguments.
+; The GCN and GFX7 checks perform four f32 adds, one per element.
+; NOTE(review): the GFX8 checks contain only two v_add_f16 and the GFX9 and
+; GFX10 checks a single v_pk_add_f16, i.e. two element adds for a 4-element
+; vector, and every prefix uses f16 instructions on bfloat data. Confirm this
+; generated output is what the test is meant to pin down.
+define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; GCN-LABEL: v_fadd_v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v4
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v5
+; GCN-NEXT:    v_add_f32_e32 v2, v2, v6
+; GCN-NEXT:    v_add_f32_e32 v3, v3, v7
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v6
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v7
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_f16_e32 v3, v0, v2
+; GFX8-NEXT:    v_add_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fadd <4 x bfloat> %a, %b
+  ret <4 x bfloat> %op
+}
+
+; fadd of two <8 x bfloat> arguments.
+; The GCN and GFX7 checks perform eight f32 adds, one per element.
+; NOTE(review): the GFX8 checks contain four v_add_f16 and the GFX9 and GFX10
+; checks two v_pk_add_f16, i.e. four element adds for an 8-element vector, and
+; every prefix uses f16 instructions on bfloat data. Confirm this generated
+; output is what the test is meant to pin down.
+define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; GCN-LABEL: v_fadd_v8bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v8
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v9
+; GCN-NEXT:    v_add_f32_e32 v2, v2, v10
+; GCN-NEXT:    v_add_f32_e32 v3, v3, v11
+; GCN-NEXT:    v_add_f32_e32 v4, v4, v12
+; GCN-NEXT:    v_add_f32_e32 v5, v5, v13
+; GCN-NEXT:    v_add_f32_e32 v6, v6, v14
+; GCN-NEXT:    v_add_f32_e32 v7, v7, v15
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_v8bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v10
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v11
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v12
+; GFX7-NEXT:    v_add_f32_e32 v3, v3, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v13
+; GFX7-NEXT:    v_add_f32_e32 v4, v4, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v14
+; GFX7-NEXT:    v_add_f32_e32 v5, v5, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v15
+; GFX7-NEXT:    v_add_f32_e32 v6, v6, v8
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_add_f32_e32 v7, v7, v9
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_v8bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_f16_e32 v6, v0, v4
+; GFX8-NEXT:    v_add_f16_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v2, v1, v5
+; GFX8-NEXT:    v_add_f16_sdwa v3, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v0, v6
+; GFX8-NEXT:    v_mov_b32_e32 v1, v4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_v8bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_add_f16 v0, v0, v4
+; GFX9-NEXT:    v_pk_add_f16 v2, v1, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_v8bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_pk_add_f16 v0, v0, v4
+; GFX10-NEXT:    v_pk_add_f16 v2, v1, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fadd <8 x bfloat> %a, %b
+  ret <8 x bfloat> %op
+}
+
+; fadd of two <16 x bfloat> arguments.
+; The GCN and GFX7 checks perform sixteen f32 adds, one per element; the value
+; added into v15 is first loaded with buffer_load_dword relative to s32.
+; NOTE(review): the GFX8 checks contain eight v_add_f16 and the GFX9 and GFX10
+; checks four v_pk_add_f16, i.e. eight element adds for a 16-element vector,
+; and every prefix uses f16 instructions on bfloat data. Confirm this
+; generated output is what the test is meant to pin down.
+define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
+; GCN-LABEL: v_fadd_v16bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v17
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v18
+; GCN-NEXT:    v_add_f32_e32 v2, v2, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v19
+; GCN-NEXT:    v_add_f32_e32 v3, v3, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v20
+; GCN-NEXT:    v_add_f32_e32 v4, v4, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v21
+; GCN-NEXT:    v_add_f32_e32 v5, v5, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v22
+; GCN-NEXT:    v_add_f32_e32 v6, v6, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v23
+; GCN-NEXT:    v_add_f32_e32 v7, v7, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v24
+; GCN-NEXT:    v_add_f32_e32 v8, v8, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v25
+; GCN-NEXT:    v_add_f32_e32 v9, v9, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v26
+; GCN-NEXT:    v_add_f32_e32 v10, v10, v16
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT:    v_cvt_f32_f16_e32 v17, v27
+; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT:    v_cvt_f32_f16_e32 v18, v28
+; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT:    v_cvt_f32_f16_e32 v19, v29
+; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT:    v_cvt_f32_f16_e32 v20, v30
+; GCN-NEXT:    v_add_f32_e32 v11, v11, v17
+; GCN-NEXT:    v_add_f32_e32 v12, v12, v18
+; GCN-NEXT:    v_add_f32_e32 v13, v13, v19
+; GCN-NEXT:    v_add_f32_e32 v14, v14, v20
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT:    v_add_f32_e32 v15, v15, v16
+; GCN-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_v16bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v20
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v18
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v21
+; GFX7-NEXT:    v_add_f32_e32 v4, v4, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v19
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v22
+; GFX7-NEXT:    v_add_f32_e32 v5, v5, v18
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_add_f32_e32 v3, v3, v16
+; GFX7-NEXT:    buffer_load_dword v16, off, s[0:3], s32
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v23
+; GFX7-NEXT:    v_add_f32_e32 v6, v6, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v24
+; GFX7-NEXT:    v_add_f32_e32 v7, v7, v18
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v25
+; GFX7-NEXT:    v_add_f32_e32 v8, v8, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v26
+; GFX7-NEXT:    v_add_f32_e32 v9, v9, v18
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v27
+; GFX7-NEXT:    v_add_f32_e32 v10, v10, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v28
+; GFX7-NEXT:    v_add_f32_e32 v11, v11, v18
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v29
+; GFX7-NEXT:    v_add_f32_e32 v12, v12, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v30
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; GFX7-NEXT:    v_add_f32_e32 v13, v13, v18
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_add_f32_e32 v14, v14, v17
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT:    v_add_f32_e32 v15, v15, v16
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_v16bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_f16_e32 v12, v0, v8
+; GFX8-NEXT:    v_add_f16_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v13, v1, v9
+; GFX8-NEXT:    v_add_f16_sdwa v9, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v4, v2, v10
+; GFX8-NEXT:    v_add_f16_sdwa v5, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v6, v3, v11
+; GFX8-NEXT:    v_add_f16_sdwa v7, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v0, v12
+; GFX8-NEXT:    v_mov_b32_e32 v1, v8
+; GFX8-NEXT:    v_mov_b32_e32 v2, v13
+; GFX8-NEXT:    v_mov_b32_e32 v3, v9
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_v16bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v10
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_add_f16 v0, v0, v8
+; GFX9-NEXT:    v_pk_add_f16 v8, v1, v9
+; GFX9-NEXT:    v_pk_add_f16 v4, v2, v10
+; GFX9-NEXT:    v_pk_add_f16 v6, v3, v11
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; GFX9-NEXT:    v_mov_b32_e32 v2, v8
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_v16bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_pk_add_f16 v0, v0, v8
+; GFX10-NEXT:    v_pk_add_f16 v8, v1, v9
+; GFX10-NEXT:    v_pk_add_f16 v4, v2, v10
+; GFX10-NEXT:    v_pk_add_f16 v6, v3, v11
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; GFX10-NEXT:    v_mov_b32_e32 v2, v8
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fadd <16 x bfloat> %a, %b
+  ret <16 x bfloat> %op
+}
+
+define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
+; GCN-LABEL: v_fadd_v32bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v32
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:16
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_add_f32_e32 v2, v2, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GCN-NEXT:    v_add_f32_e32 v3, v3, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_add_f32_e32 v4, v4, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GCN-NEXT:    v_add_f32_e32 v5, v5, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:32
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_add_f32_e32 v6, v6, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GCN-NEXT:    v_add_f32_e32 v7, v7, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_add_f32_e32 v8, v8, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GCN-NEXT:    v_add_f32_e32 v9, v9, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:48
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_add_f32_e32 v10, v10, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GCN-NEXT:    v_add_f32_e32 v11, v11, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_add_f32_e32 v12, v12, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GCN-NEXT:    v_add_f32_e32 v13, v13, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:64
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_add_f32_e32 v14, v14, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GCN-NEXT:    v_add_f32_e32 v15, v15, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_add_f32_e32 v16, v16, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v17, v17
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GCN-NEXT:    v_add_f32_e32 v17, v17, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v18, v18
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_add_f32_e32 v18, v18, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v19, v19
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GCN-NEXT:    v_add_f32_e32 v19, v19, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v20, v20
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:88
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_add_f32_e32 v20, v20, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v21, v21
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GCN-NEXT:    v_add_f32_e32 v21, v21, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v22, v22
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:96
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_add_f32_e32 v22, v22, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v23, v23
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GCN-NEXT:    v_add_f32_e32 v23, v23, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v24, v24
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:104
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_add_f32_e32 v24, v24, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v25, v25
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GCN-NEXT:    v_add_f32_e32 v25, v25, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v26, v26
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:112
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_add_f32_e32 v26, v26, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GCN-NEXT:    v_add_f32_e32 v27, v27, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v28, v28
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_add_f32_e32 v28, v28, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v29, v29
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    v_add_f32_e32 v29, v29, v31
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:124
+; GCN-NEXT:    v_cvt_f32_f16_e32 v30, v30
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
+; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    v_add_f32_e32 v30, v30, v31
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v32
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v33
+; GCN-NEXT:    v_add_f32_e32 v31, v31, v32
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; GCN-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v19
+; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v20
+; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v21
+; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; GCN-NEXT:    v_cvt_f16_f32_e32 v23, v23
+; GCN-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; GCN-NEXT:    v_cvt_f16_f32_e32 v25, v25
+; GCN-NEXT:    v_cvt_f16_f32_e32 v26, v26
+; GCN-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v28
+; GCN-NEXT:    v_cvt_f16_f32_e32 v29, v29
+; GCN-NEXT:    v_cvt_f16_f32_e32 v30, v30
+; GCN-NEXT:    v_cvt_f16_f32_e32 v31, v31
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_v32bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v18
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v19, v19
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v20, v20
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v21, v21
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v22, v22
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v23, v23
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v24, v24
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v25, v25
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v26, v26
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v28, v28
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v29, v29
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v30, v30
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:12
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:16
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v3, v3, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:20
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v4, v4, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v5, v5, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v6, v6, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:32
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v7, v7, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:36
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v8, v8, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v9, v9, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:44
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v10, v10, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:48
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v11, v11, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:52
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v12, v12, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v13, v13, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:60
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v14, v14, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:64
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v15, v15, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v16, v16, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v17, v17, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:76
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v18, v18, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v19, v19, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:84
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v19, v19
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v20, v20, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:88
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v20, v20
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v21, v21, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v21, v21
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v22, v22, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:96
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v23, v23, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:100
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v23, v23
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v24, v24, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:104
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v25, v25, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:108
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v25, v25
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v26, v26, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:112
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v26, v26
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v27, v27, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v28, v28, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v28, v28
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v29, v29, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:124
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v29, v29
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v30, v30, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v30, v30
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_add_f32_e32 v31, v31, v32
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_v32bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_f16_e32 v24, v0, v16
+; GFX8-NEXT:    v_add_f16_sdwa v16, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v25, v1, v17
+; GFX8-NEXT:    v_add_f16_sdwa v17, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v26, v2, v18
+; GFX8-NEXT:    v_add_f16_sdwa v18, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v27, v3, v19
+; GFX8-NEXT:    v_add_f16_sdwa v19, v3, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v8, v4, v20
+; GFX8-NEXT:    v_add_f16_sdwa v9, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v10, v5, v21
+; GFX8-NEXT:    v_add_f16_sdwa v11, v5, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v12, v6, v22
+; GFX8-NEXT:    v_add_f16_sdwa v13, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v14, v7, v23
+; GFX8-NEXT:    v_add_f16_sdwa v15, v7, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v0, v24
+; GFX8-NEXT:    v_mov_b32_e32 v1, v16
+; GFX8-NEXT:    v_mov_b32_e32 v2, v25
+; GFX8-NEXT:    v_mov_b32_e32 v3, v17
+; GFX8-NEXT:    v_mov_b32_e32 v4, v26
+; GFX8-NEXT:    v_mov_b32_e32 v5, v18
+; GFX8-NEXT:    v_mov_b32_e32 v6, v27
+; GFX8-NEXT:    v_mov_b32_e32 v7, v19
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_v32bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v24, 16, v16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 16, v17
+; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
+; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 16, v19
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v20
+; GFX9-NEXT:    v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v21
+; GFX9-NEXT:    v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v22
+; GFX9-NEXT:    v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v23
+; GFX9-NEXT:    v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v20, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v21, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v22, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v23, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_add_f16 v0, v0, v16
+; GFX9-NEXT:    v_pk_add_f16 v16, v1, v17
+; GFX9-NEXT:    v_pk_add_f16 v18, v2, v18
+; GFX9-NEXT:    v_pk_add_f16 v17, v3, v19
+; GFX9-NEXT:    v_pk_add_f16 v8, v4, v20
+; GFX9-NEXT:    v_pk_add_f16 v10, v5, v21
+; GFX9-NEXT:    v_pk_add_f16 v12, v6, v22
+; GFX9-NEXT:    v_pk_add_f16 v14, v7, v23
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v18
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v17
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
+; GFX9-NEXT:    v_mov_b32_e32 v2, v16
+; GFX9-NEXT:    v_mov_b32_e32 v4, v18
+; GFX9-NEXT:    v_mov_b32_e32 v6, v17
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_v32bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v17
+; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v27, 16, v19
+; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v20
+; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v21
+; GFX10-NEXT:    v_lshrrev_b32_e32 v30, 16, v22
+; GFX10-NEXT:    v_lshrrev_b32_e32 v31, 16, v23
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v20, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v21, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v22, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v23, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_pk_add_f16 v0, v0, v16
+; GFX10-NEXT:    v_pk_add_f16 v16, v1, v17
+; GFX10-NEXT:    v_pk_add_f16 v18, v2, v18
+; GFX10-NEXT:    v_pk_add_f16 v17, v3, v19
+; GFX10-NEXT:    v_pk_add_f16 v8, v4, v20
+; GFX10-NEXT:    v_pk_add_f16 v10, v5, v21
+; GFX10-NEXT:    v_pk_add_f16 v12, v6, v22
+; GFX10-NEXT:    v_pk_add_f16 v14, v7, v23
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v17
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
+; GFX10-NEXT:    v_mov_b32_e32 v2, v16
+; GFX10-NEXT:    v_mov_b32_e32 v4, v18
+; GFX10-NEXT:    v_mov_b32_e32 v6, v17
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fadd <32 x bfloat> %a, %b
+  ret <32 x bfloat> %op
+}
+
+define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
+; GCN-LABEL: v_fadd_bf16_fpimm_0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, 0x3f80
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_bf16_fpimm_0:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, 0x3f80
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_bf16_fpimm_0:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_f16_e32 v0, 0x3f80, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_bf16_fpimm_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_f16_e32 v0, 0x3f80, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_bf16_fpimm_0:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_f16_e32 v0, 0x3f80, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %add = fadd bfloat %arg0, 1.0
+  ret bfloat %add
+}
+
+define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
+; GCN-LABEL: v_fadd_bf16_fpimm_1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, 0x4228
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_bf16_fpimm_1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, 0x4228
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_bf16_fpimm_1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_f16_e32 v0, 0x4228, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_bf16_fpimm_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_f16_e32 v0, 0x4228, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_bf16_fpimm_1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_f16_e32 v0, 0x4228, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %add = fadd bfloat %arg0, 42.0
+  ret bfloat %add
+}
+
+define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fsub_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e64 v1, -v1
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fsub_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e64 v1, -v1
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fsub_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_sub_f16_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fsub_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sub_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fsub_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_sub_f16_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fsub bfloat %a, %b
+  ret bfloat %op
+}
+
+define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_fsub_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_or_b32_e32 v2, v3, v2
+; GCN-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v3
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v2
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fsub_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fsub_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
+; GFX8-NEXT:    v_add_f16_e32 v2, v0, v1
+; GFX8-NEXT:    v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fsub_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fsub_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %op
+}
+
+define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+; GCN-LABEL: v_fsub_v3bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e64 v3, -v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e64 v4, -v4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e64 v5, -v5
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v3
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v4
+; GCN-NEXT:    v_add_f32_e32 v2, v2, v5
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fsub_v3bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e64 v3, -v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e64 v3, -v4
+; GFX7-NEXT:    v_cvt_f32_f16_e64 v4, -v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fsub_v3bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_sub_f16_e32 v3, v0, v2
+; GFX8-NEXT:    v_sub_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fsub_v3bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sub_f16_e32 v3, v0, v2
+; GFX9-NEXT:    v_sub_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fsub_v3bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_sub_f16_e32 v3, v0, v2
+; GFX10-NEXT:    v_sub_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fsub <3 x bfloat> %a, %b
+  ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; GCN-LABEL: v_fsub_v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e64 v4, -v4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e64 v5, -v5
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e64 v6, -v6
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_f16_e64 v7, -v7
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v4
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v5
+; GCN-NEXT:    v_add_f32_e32 v2, v2, v6
+; GCN-NEXT:    v_add_f32_e32 v3, v3, v7
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fsub_v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e64 v4, -v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e64 v5, -v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e64 v4, -v6
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e64 v5, -v7
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fsub_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_sub_f16_e32 v3, v0, v2
+; GFX8-NEXT:    v_sub_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fsub_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sub_f16_e32 v3, v0, v2
+; GFX9-NEXT:    v_sub_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fsub_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_sub_f16_e32 v3, v0, v2
+; GFX10-NEXT:    v_sub_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fsub <4 x bfloat> %a, %b
+  ret <4 x bfloat> %op
+}
+
+define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fmul_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmul_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmul_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fmul bfloat %a, %b
+  ret bfloat %op
+}
+
+define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GCN-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmul_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mul_f16_e32 v2, v0, v1
+; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmul_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %op
+}
+
+define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v3bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v3
+; GCN-NEXT:    v_mul_f32_e32 v1, v1, v4
+; GCN-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmul_v3bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v3bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mul_f16_e32 v3, v0, v2
+; GFX8-NEXT:    v_mul_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmul_v3bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v0
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v2, v2
+; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v3bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
+; GFX10-NEXT:    v_bfi_b32 v1, 0xffff, v2, v2
+; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fmul <3 x bfloat> %a, %b
+  ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GCN-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GCN-NEXT:    v_mul_f32_e32 v2, v2, v6
+; GCN-NEXT:    v_mul_f32_e32 v3, v3, v7
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmul_v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v6
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v7
+; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mul_f16_e32 v3, v0, v2
+; GFX8-NEXT:    v_mul_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmul_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fmul <4 x bfloat> %a, %b
+  ret <4 x bfloat> %op
+}
+
+define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v8bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v8
+; GCN-NEXT:    v_mul_f32_e32 v1, v1, v9
+; GCN-NEXT:    v_mul_f32_e32 v2, v2, v10
+; GCN-NEXT:    v_mul_f32_e32 v3, v3, v11
+; GCN-NEXT:    v_mul_f32_e32 v4, v4, v12
+; GCN-NEXT:    v_mul_f32_e32 v5, v5, v13
+; GCN-NEXT:    v_mul_f32_e32 v6, v6, v14
+; GCN-NEXT:    v_mul_f32_e32 v7, v7, v15
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmul_v8bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v10
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v11
+; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v12
+; GFX7-NEXT:    v_mul_f32_e32 v3, v3, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v13
+; GFX7-NEXT:    v_mul_f32_e32 v4, v4, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v14
+; GFX7-NEXT:    v_mul_f32_e32 v5, v5, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v15
+; GFX7-NEXT:    v_mul_f32_e32 v6, v6, v8
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_mul_f32_e32 v7, v7, v9
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v8bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mul_f16_e32 v6, v0, v4
+; GFX8-NEXT:    v_mul_f16_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_mul_f16_e32 v2, v1, v5
+; GFX8-NEXT:    v_mul_f16_sdwa v3, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v0, v6
+; GFX8-NEXT:    v_mov_b32_e32 v1, v4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmul_v8bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v4
+; GFX9-NEXT:    v_pk_mul_f16 v2, v1, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v8bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v4
+; GFX10-NEXT:    v_pk_mul_f16 v2, v1, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fmul <8 x bfloat> %a, %b
+  ret <8 x bfloat> %op
+}
+
+define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v16bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v17
+; GCN-NEXT:    v_mul_f32_e32 v1, v1, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v18
+; GCN-NEXT:    v_mul_f32_e32 v2, v2, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v19
+; GCN-NEXT:    v_mul_f32_e32 v3, v3, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v20
+; GCN-NEXT:    v_mul_f32_e32 v4, v4, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v21
+; GCN-NEXT:    v_mul_f32_e32 v5, v5, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v22
+; GCN-NEXT:    v_mul_f32_e32 v6, v6, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v23
+; GCN-NEXT:    v_mul_f32_e32 v7, v7, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v24
+; GCN-NEXT:    v_mul_f32_e32 v8, v8, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v25
+; GCN-NEXT:    v_mul_f32_e32 v9, v9, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v26
+; GCN-NEXT:    v_mul_f32_e32 v10, v10, v16
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT:    v_cvt_f32_f16_e32 v17, v27
+; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT:    v_cvt_f32_f16_e32 v18, v28
+; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT:    v_cvt_f32_f16_e32 v19, v29
+; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT:    v_cvt_f32_f16_e32 v20, v30
+; GCN-NEXT:    v_mul_f32_e32 v11, v11, v17
+; GCN-NEXT:    v_mul_f32_e32 v12, v12, v18
+; GCN-NEXT:    v_mul_f32_e32 v13, v13, v19
+; GCN-NEXT:    v_mul_f32_e32 v14, v14, v20
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT:    v_mul_f32_e32 v15, v15, v16
+; GCN-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmul_v16bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v20
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v18
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v21
+; GFX7-NEXT:    v_mul_f32_e32 v4, v4, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v19
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v22
+; GFX7-NEXT:    v_mul_f32_e32 v5, v5, v18
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_mul_f32_e32 v3, v3, v16
+; GFX7-NEXT:    buffer_load_dword v16, off, s[0:3], s32
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v23
+; GFX7-NEXT:    v_mul_f32_e32 v6, v6, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v24
+; GFX7-NEXT:    v_mul_f32_e32 v7, v7, v18
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v25
+; GFX7-NEXT:    v_mul_f32_e32 v8, v8, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v26
+; GFX7-NEXT:    v_mul_f32_e32 v9, v9, v18
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v27
+; GFX7-NEXT:    v_mul_f32_e32 v10, v10, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v28
+; GFX7-NEXT:    v_mul_f32_e32 v11, v11, v18
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v29
+; GFX7-NEXT:    v_mul_f32_e32 v12, v12, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v30
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; GFX7-NEXT:    v_mul_f32_e32 v13, v13, v18
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_mul_f32_e32 v14, v14, v17
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT:    v_mul_f32_e32 v15, v15, v16
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v16bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mul_f16_e32 v12, v0, v8
+; GFX8-NEXT:    v_mul_f16_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_mul_f16_e32 v13, v1, v9
+; GFX8-NEXT:    v_mul_f16_sdwa v9, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_mul_f16_e32 v4, v2, v10
+; GFX8-NEXT:    v_mul_f16_sdwa v5, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_mul_f16_e32 v6, v3, v11
+; GFX8-NEXT:    v_mul_f16_sdwa v7, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v0, v12
+; GFX8-NEXT:    v_mov_b32_e32 v1, v8
+; GFX8-NEXT:    v_mov_b32_e32 v2, v13
+; GFX8-NEXT:    v_mov_b32_e32 v3, v9
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmul_v16bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v10
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v8
+; GFX9-NEXT:    v_pk_mul_f16 v8, v1, v9
+; GFX9-NEXT:    v_pk_mul_f16 v4, v2, v10
+; GFX9-NEXT:    v_pk_mul_f16 v6, v3, v11
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; GFX9-NEXT:    v_mov_b32_e32 v2, v8
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v16bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v8
+; GFX10-NEXT:    v_pk_mul_f16 v8, v1, v9
+; GFX10-NEXT:    v_pk_mul_f16 v4, v2, v10
+; GFX10-NEXT:    v_pk_mul_f16 v6, v3, v11
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; GFX10-NEXT:    v_mov_b32_e32 v2, v8
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fmul <16 x bfloat> %a, %b
+  ret <16 x bfloat> %op
+}
+
+define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v32bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v32
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GCN-NEXT:    v_mul_f32_e32 v1, v1, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:16
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_mul_f32_e32 v2, v2, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GCN-NEXT:    v_mul_f32_e32 v3, v3, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_mul_f32_e32 v4, v4, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GCN-NEXT:    v_mul_f32_e32 v5, v5, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:32
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_mul_f32_e32 v6, v6, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GCN-NEXT:    v_mul_f32_e32 v7, v7, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_mul_f32_e32 v8, v8, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GCN-NEXT:    v_mul_f32_e32 v9, v9, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:48
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_mul_f32_e32 v10, v10, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GCN-NEXT:    v_mul_f32_e32 v11, v11, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_mul_f32_e32 v12, v12, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GCN-NEXT:    v_mul_f32_e32 v13, v13, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:64
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_mul_f32_e32 v14, v14, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GCN-NEXT:    v_mul_f32_e32 v15, v15, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_mul_f32_e32 v16, v16, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v17, v17
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GCN-NEXT:    v_mul_f32_e32 v17, v17, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v18, v18
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_mul_f32_e32 v18, v18, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v19, v19
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GCN-NEXT:    v_mul_f32_e32 v19, v19, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v20, v20
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:88
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_mul_f32_e32 v20, v20, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v21, v21
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GCN-NEXT:    v_mul_f32_e32 v21, v21, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v22, v22
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:96
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_mul_f32_e32 v22, v22, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v23, v23
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GCN-NEXT:    v_mul_f32_e32 v23, v23, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v24, v24
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:104
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_mul_f32_e32 v24, v24, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v25, v25
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GCN-NEXT:    v_mul_f32_e32 v25, v25, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v26, v26
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:112
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_mul_f32_e32 v26, v26, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GCN-NEXT:    v_mul_f32_e32 v27, v27, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v28, v28
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_mul_f32_e32 v28, v28, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v29, v29
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    v_mul_f32_e32 v29, v29, v31
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:124
+; GCN-NEXT:    v_cvt_f32_f16_e32 v30, v30
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
+; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    v_mul_f32_e32 v30, v30, v31
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v32
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v33
+; GCN-NEXT:    v_mul_f32_e32 v31, v31, v32
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; GCN-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v19
+; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v20
+; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v21
+; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; GCN-NEXT:    v_cvt_f16_f32_e32 v23, v23
+; GCN-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; GCN-NEXT:    v_cvt_f16_f32_e32 v25, v25
+; GCN-NEXT:    v_cvt_f16_f32_e32 v26, v26
+; GCN-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v28
+; GCN-NEXT:    v_cvt_f16_f32_e32 v29, v29
+; GCN-NEXT:    v_cvt_f16_f32_e32 v30, v30
+; GCN-NEXT:    v_cvt_f16_f32_e32 v31, v31
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmul_v32bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v18
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v19, v19
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v20, v20
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v21, v21
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v22, v22
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v23, v23
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v24, v24
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v25, v25
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v26, v26
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v28, v28
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v29, v29
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v30, v30
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:12
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:16
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v3, v3, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:20
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v4, v4, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v5, v5, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v6, v6, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:32
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v7, v7, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:36
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v8, v8, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v9, v9, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:44
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v10, v10, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:48
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v11, v11, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:52
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v12, v12, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v13, v13, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:60
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v14, v14, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:64
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v15, v15, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v16, v16, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v17, v17, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:76
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v18, v18, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v19, v19, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:84
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v19, v19
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v20, v20, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:88
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v20, v20
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v21, v21, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v21, v21
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v22, v22, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:96
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v23, v23, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:100
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v23, v23
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v24, v24, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:104
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v25, v25, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:108
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v25, v25
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v26, v26, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:112
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v26, v26
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v27, v27, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v28, v28, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v28, v28
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v29, v29, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:124
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v29, v29
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v30, v30, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v30, v30
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_mul_f32_e32 v31, v31, v32
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v32bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mul_f16_e32 v24, v0, v16
+; GFX8-NEXT:    v_mul_f16_sdwa v16, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_mul_f16_e32 v25, v1, v17
+; GFX8-NEXT:    v_mul_f16_sdwa v17, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_mul_f16_e32 v26, v2, v18
+; GFX8-NEXT:    v_mul_f16_sdwa v18, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_mul_f16_e32 v27, v3, v19
+; GFX8-NEXT:    v_mul_f16_sdwa v19, v3, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_mul_f16_e32 v8, v4, v20
+; GFX8-NEXT:    v_mul_f16_sdwa v9, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_mul_f16_e32 v10, v5, v21
+; GFX8-NEXT:    v_mul_f16_sdwa v11, v5, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_mul_f16_e32 v12, v6, v22
+; GFX8-NEXT:    v_mul_f16_sdwa v13, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_mul_f16_e32 v14, v7, v23
+; GFX8-NEXT:    v_mul_f16_sdwa v15, v7, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v0, v24
+; GFX8-NEXT:    v_mov_b32_e32 v1, v16
+; GFX8-NEXT:    v_mov_b32_e32 v2, v25
+; GFX8-NEXT:    v_mov_b32_e32 v3, v17
+; GFX8-NEXT:    v_mov_b32_e32 v4, v26
+; GFX8-NEXT:    v_mov_b32_e32 v5, v18
+; GFX8-NEXT:    v_mov_b32_e32 v6, v27
+; GFX8-NEXT:    v_mov_b32_e32 v7, v19
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmul_v32bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v24, 16, v16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 16, v17
+; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
+; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 16, v19
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v20
+; GFX9-NEXT:    v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v21
+; GFX9-NEXT:    v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v22
+; GFX9-NEXT:    v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v23
+; GFX9-NEXT:    v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v20, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v21, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v22, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v23, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v16
+; GFX9-NEXT:    v_pk_mul_f16 v16, v1, v17
+; GFX9-NEXT:    v_pk_mul_f16 v18, v2, v18
+; GFX9-NEXT:    v_pk_mul_f16 v17, v3, v19
+; GFX9-NEXT:    v_pk_mul_f16 v8, v4, v20
+; GFX9-NEXT:    v_pk_mul_f16 v10, v5, v21
+; GFX9-NEXT:    v_pk_mul_f16 v12, v6, v22
+; GFX9-NEXT:    v_pk_mul_f16 v14, v7, v23
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v18
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v17
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
+; GFX9-NEXT:    v_mov_b32_e32 v2, v16
+; GFX9-NEXT:    v_mov_b32_e32 v4, v18
+; GFX9-NEXT:    v_mov_b32_e32 v6, v17
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v32bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v17
+; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v27, 16, v19
+; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v20
+; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v21
+; GFX10-NEXT:    v_lshrrev_b32_e32 v30, 16, v22
+; GFX10-NEXT:    v_lshrrev_b32_e32 v31, 16, v23
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v20, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v21, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v22, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v23, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v16
+; GFX10-NEXT:    v_pk_mul_f16 v16, v1, v17
+; GFX10-NEXT:    v_pk_mul_f16 v18, v2, v18
+; GFX10-NEXT:    v_pk_mul_f16 v17, v3, v19
+; GFX10-NEXT:    v_pk_mul_f16 v8, v4, v20
+; GFX10-NEXT:    v_pk_mul_f16 v10, v5, v21
+; GFX10-NEXT:    v_pk_mul_f16 v12, v6, v22
+; GFX10-NEXT:    v_pk_mul_f16 v14, v7, v23
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v17
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
+; GFX10-NEXT:    v_mov_b32_e32 v2, v16
+; GFX10-NEXT:    v_mov_b32_e32 v4, v18
+; GFX10-NEXT:    v_mov_b32_e32 v6, v17
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fmul <32 x bfloat> %a, %b
+  ret <32 x bfloat> %op
+}
+
+; Divides two bfloat arguments with a plain fdiv and returns the quotient.
+; The check lines below give the expected assembly for each of the GCN, GFX7,
+; GFX8, GFX9 and GFX10 check prefixes.
+; NOTE(review): each expected sequence contains v_cvt_f32_f16, i.e. the 16-bit
+; inputs are widened as f16 rather than bf16 -- confirm this is the intended
+; lowering.
+define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fdiv_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GCN-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
+; GCN-NEXT:    v_rcp_f32_e32 v4, v2
+; GCN-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
+; GCN-NEXT:    v_fma_f32 v4, v5, v4, v4
+; GCN-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GCN-NEXT:    v_fma_f32 v6, -v2, v5, v3
+; GCN-NEXT:    v_fma_f32 v5, v6, v4, v5
+; GCN-NEXT:    v_fma_f32 v2, -v2, v5, v3
+; GCN-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
+; GCN-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fdiv_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX7-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; GFX7-NEXT:    v_fma_f32 v3, v5, v3, v3
+; GFX7-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX7-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; GFX7-NEXT:    v_fma_f32 v5, v6, v3, v5
+; GFX7-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; GFX7-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GFX7-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fdiv_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX8-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX8-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX8-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fdiv_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX9-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fdiv_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX10-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX10-NEXT:    v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
+; GFX10-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fdiv bfloat %a, %b
+  ret bfloat %op
+}
+
+declare bfloat @llvm.fabs.bf16(bfloat)
+
+; Returns llvm.fabs.bf16 of its bfloat argument.
+; Every check prefix expects a single v_and_b32 of v0 with 0x7fff before the
+; return.
+define bfloat @v_fabs_bf16(bfloat %a) {
+; GCN-LABEL: v_fabs_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fabs_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fabs_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fabs_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fabs_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.fabs.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+; amdgpu_ps variant of the fabs test with an inreg bfloat argument.
+; The fabs result is bitcast to i16, zero-extended to i32 and passed through
+; llvm.amdgcn.readfirstlane before being returned.
+; Every check prefix expects s_and_b32 with 0x7fff followed by s_and_b32 with
+; 0xffff on s0.
+define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) {
+; GCN-LABEL: s_fabs_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GCN-NEXT:    s_and_b32 s0, 0xffff, s0
+; GCN-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: s_fabs_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX7-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fabs_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fabs_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fabs_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX10-NEXT:    ; return to shader part epilog
+  %op = call bfloat @llvm.fabs.bf16(bfloat %a)
+  %cast = bitcast bfloat %op to i16
+  %zext = zext i16 %cast to i32
+  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+  ret i32 %readlane
+}
+
+; Returns the fneg of its bfloat argument.
+; Every check prefix expects a single v_xor_b32 of v0 with 0x8000 before the
+; return.
+define bfloat @v_fneg_bf16(bfloat %a) {
+; GCN-LABEL: v_fneg_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fneg_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fneg_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fneg_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fneg_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fneg bfloat %a
+  ret bfloat %op
+}
+
+declare i32 @llvm.amdgcn.readfirstlane(i32)
+
+; FIXME: readfirstlane hack for other bugs
+; amdgpu_ps variant of the fneg test with an inreg bfloat argument.
+; The fneg result is bitcast to i16, zero-extended to i32 and passed through
+; llvm.amdgcn.readfirstlane before being returned.
+; Every check prefix expects s_xor_b32 with 0x8000 followed by s_and_b32 with
+; 0xffff on s0.
+define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) {
+; GCN-LABEL: s_fneg_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_xor_b32 s0, s0, 0x8000
+; GCN-NEXT:    s_and_b32 s0, 0xffff, s0
+; GCN-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: s_fneg_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_xor_b32 s0, s0, 0x8000
+; GFX7-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fneg_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_xor_b32 s0, s0, 0x8000
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fneg_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_xor_b32 s0, s0, 0x8000
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fneg_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_xor_b32 s0, s0, 0x8000
+; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX10-NEXT:    ; return to shader part epilog
+  %op = fneg bfloat %a
+  %cast = bitcast bfloat %op to i16
+  %zext = zext i16 %cast to i32
+  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+  ret i32 %readlane
+}
+
+; Returns the fneg of llvm.fabs.bf16 of its bfloat argument.
+; Every check prefix expects the two operations to become a single v_or_b32 of
+; v0 with 0x8000 before the return.
+define bfloat @v_fneg_fabs_bf16(bfloat %a) {
+; GCN-LABEL: v_fneg_fabs_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_or_b32_e32 v0, 0x8000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fneg_fabs_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_or_b32_e32 v0, 0x8000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fneg_fabs_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_or_b32_e32 v0, 0x8000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fneg_fabs_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_or_b32_e32 v0, 0x8000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fneg_fabs_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_or_b32_e32 v0, 0x8000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
+  %op = fneg bfloat %fabs
+  ret bfloat %op
+}
+
+; FIXME: readfirstlane hack for other bugs
+; amdgpu_ps variant of the fneg(fabs) test with an inreg bfloat argument.
+; The result is bitcast to i16, zero-extended to i32 and passed through
+; llvm.amdgcn.readfirstlane before being returned.
+; Every check prefix expects s_bitset1_b32 on bit 15 of s0 followed by
+; s_and_b32 with 0xffff.
+define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
+; GCN-LABEL: s_fneg_fabs_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_bitset1_b32 s0, 15
+; GCN-NEXT:    s_and_b32 s0, 0xffff, s0
+; GCN-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: s_fneg_fabs_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_bitset1_b32 s0, 15
+; GFX7-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fneg_fabs_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_bitset1_b32 s0, 15
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fneg_fabs_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_bitset1_b32 s0, 15
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fneg_fabs_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_bitset1_b32 s0, 15
+; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX10-NEXT:    ; return to shader part epilog
+  %fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
+  %op = fneg bfloat %fabs
+  %cast = bitcast bfloat %op to i16
+  %zext = zext i16 %cast to i32
+  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+  ret i32 %readlane
+}
+
+; Declarations of the llvm.minnum overloads for scalar bfloat and for
+; 2-, 3-, 4-, 8-, 16- and 32-element bfloat vectors.
+declare bfloat @llvm.minnum.bf16(bfloat, bfloat)
+declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
+declare <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat>, <3 x bfloat>)
+declare <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
+declare <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat>, <8 x bfloat>)
+declare <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat>, <16 x bfloat>)
+declare <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat>, <32 x bfloat>)
+
+; Calls llvm.minnum.bf16 on two scalar bfloat arguments and returns the result.
+; The GCN and GFX7 checks convert both inputs with v_cvt_f32_f16, apply
+; v_min_f32 and convert back with v_cvt_f16_f32. The GFX8, GFX9 and GFX10 checks
+; apply v_max_f16 x, x to each input and then v_min_f16.
+; NOTE(review): every expectation uses f16 instructions on bfloat values.
+; bfloat and half are distinct 16-bit formats, so confirm these checks reflect
+; the intended lowering rather than a baseline that still needs fixing.
+define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_minnum_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX8-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX10-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX10-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.minnum.bf16(bfloat %a, bfloat %b)
+  ret bfloat %op
+}
+
+; Calls llvm.minnum.v2bf16 on two <2 x bfloat> arguments.
+; The GCN and GFX7 checks process one element per VGPR (v0/v1 against v2/v3)
+; through v_cvt_f32_f16, v_min_f32 and v_cvt_f16_f32. The GFX8 checks work on
+; the low half and, via SDWA WORD_1 selects, the high half of v0 and v1, and
+; merge the two results with v_or_b32. The GFX9 and GFX10 checks use
+; v_pk_max_f16 on each input followed by v_pk_min_f16.
+; NOTE(review): every expectation uses f16 instructions on bfloat values;
+; confirm this is the intended lowering for bfloat.
+define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v2
+; GCN-NEXT:    v_min_f32_e32 v1, v1, v3
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f16_e32 v2, v0, v0
+; GFX8-NEXT:    v_max_f16_e32 v3, v1, v1
+; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_e32 v2, v2, v3
+; GFX8-NEXT:    v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
+  ret <2 x bfloat> %op
+}
+
+; Calls llvm.minnum.v3bf16 on two <3 x bfloat> arguments.
+; The GCN and GFX7 checks process three elements, one per VGPR (v0-v2 against
+; v3-v5), through v_cvt_f32_f16, v_min_f32 and v_cvt_f16_f32. The GFX8 checks
+; take v_min_f16 of the low and high halves of v0 and v2. The GFX9 and GFX10
+; checks apply v_bfi_b32 with a 0xffff mask to v0 and v2, then v_pk_max_f16 and
+; one v_pk_min_f16, and shift the high half of the result into v1.
+; NOTE(review): the GFX8, GFX9 and GFX10 checks read only v0 and v2 as inputs
+; and contain min operations for two 16-bit lanes. It is not evident from the
+; checks how the third element is handled; verify against the calling
+; convention. They also use f16 instructions on bfloat values.
+define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v3bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v3
+; GCN-NEXT:    v_min_f32_e32 v1, v1, v4
+; GCN-NEXT:    v_min_f32_e32 v2, v2, v5
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_v3bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_v3bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f16_e32 v1, v0, v0
+; GFX8-NEXT:    v_max_f16_e32 v3, v2, v2
+; GFX8-NEXT:    v_min_f16_e32 v3, v1, v3
+; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_e32 v1, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_v3bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v0
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v2, v2
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_v3bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
+; GFX10-NEXT:    v_bfi_b32 v1, 0xffff, v2, v2
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
+  ret <3 x bfloat> %op
+}
+
+; Calls llvm.minnum.v4bf16 on two <4 x bfloat> arguments.
+; The GCN and GFX7 checks process four elements, one per VGPR (v0-v3 against
+; v4-v7), through v_cvt_f32_f16, v_min_f32 and v_cvt_f16_f32. The GFX8 checks
+; take v_min_f16 of the low and high halves of v0 and v2. The GFX9 and GFX10
+; checks rebuild v0 and v2 with v_lshrrev_b32 and v_mov_b32_sdwa, then apply
+; v_pk_max_f16 and one v_pk_min_f16, and shift the high half of the result
+; into v1.
+; NOTE(review): the GFX8, GFX9 and GFX10 checks read only v0 and v2 as inputs
+; and contain min operations for two 16-bit lanes of a four-element vector;
+; verify that this matches the intended argument layout. They also use f16
+; instructions on bfloat values.
+define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v4
+; GCN-NEXT:    v_min_f32_e32 v1, v1, v5
+; GCN-NEXT:    v_min_f32_e32 v2, v2, v6
+; GCN-NEXT:    v_min_f32_e32 v3, v3, v7
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v6
+; GFX7-NEXT:    v_min_f32_e32 v1, v1, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v7
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_min_f32_e32 v3, v3, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f16_e32 v1, v0, v0
+; GFX8-NEXT:    v_max_f16_e32 v3, v2, v2
+; GFX8-NEXT:    v_min_f16_e32 v3, v1, v3
+; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_e32 v1, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT:    v_pk_max_f16 v1, v2, v2
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT:    v_pk_max_f16 v1, v2, v2
+; GFX10-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
+  ret <4 x bfloat> %op
+}
+
+; Calls llvm.minnum.v8bf16 on two <8 x bfloat> arguments.
+; The GCN and GFX7 checks process eight elements, one per VGPR (v0-v7 against
+; v8-v15), through v_cvt_f32_f16, v_min_f32 and v_cvt_f16_f32. The GFX8 checks
+; contain four v_min_f16 over the low and high halves of v0/v4 and v1/v5. The
+; GFX9 and GFX10 checks contain two v_pk_min_f16 over v0/v4 and v1/v5 and then
+; shift the high halves of the results into v1 and v3.
+; NOTE(review): the GFX8, GFX9 and GFX10 checks read only v0, v1, v4 and v5 as
+; inputs and cover four 16-bit lanes of an eight-element vector; verify that
+; this matches the intended argument layout. They also use f16 instructions on
+; bfloat values.
+define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v8bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v8
+; GCN-NEXT:    v_min_f32_e32 v1, v1, v9
+; GCN-NEXT:    v_min_f32_e32 v2, v2, v10
+; GCN-NEXT:    v_min_f32_e32 v3, v3, v11
+; GCN-NEXT:    v_min_f32_e32 v4, v4, v12
+; GCN-NEXT:    v_min_f32_e32 v5, v5, v13
+; GCN-NEXT:    v_min_f32_e32 v6, v6, v14
+; GCN-NEXT:    v_min_f32_e32 v7, v7, v15
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_v8bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v10
+; GFX7-NEXT:    v_min_f32_e32 v1, v1, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v11
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v12
+; GFX7-NEXT:    v_min_f32_e32 v3, v3, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v13
+; GFX7-NEXT:    v_min_f32_e32 v4, v4, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v14
+; GFX7-NEXT:    v_min_f32_e32 v5, v5, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v15
+; GFX7-NEXT:    v_min_f32_e32 v6, v6, v8
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_min_f32_e32 v7, v7, v9
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_v8bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f16_e32 v2, v0, v0
+; GFX8-NEXT:    v_max_f16_e32 v3, v4, v4
+; GFX8-NEXT:    v_min_f16_e32 v6, v2, v3
+; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_e32 v4, v0, v2
+; GFX8-NEXT:    v_max_f16_e32 v0, v1, v1
+; GFX8-NEXT:    v_max_f16_e32 v2, v5, v5
+; GFX8-NEXT:    v_min_f16_e32 v2, v0, v2
+; GFX8-NEXT:    v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_e32 v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, v6
+; GFX8-NEXT:    v_mov_b32_e32 v1, v4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_v8bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT:    v_pk_max_f16 v2, v4, v4
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT:    v_pk_max_f16 v2, v5, v5
+; GFX9-NEXT:    v_pk_min_f16 v2, v1, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_v8bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT:    v_pk_max_f16 v2, v4, v4
+; GFX10-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT:    v_pk_max_f16 v3, v5, v5
+; GFX10-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX10-NEXT:    v_pk_min_f16 v2, v1, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
+  ret <8 x bfloat> %op
+}
+
+; Calls llvm.minnum.v16bf16 on two <16 x bfloat> arguments.
+; The GCN and GFX7 checks process sixteen elements, one per VGPR. v0-v14 are
+; paired with v16-v30, and v15 is paired with a value loaded from the stack by
+; buffer_load_dword at s32. Each pair goes through v_cvt_f32_f16, v_min_f32 and
+; v_cvt_f16_f32. The GFX8 checks contain eight v_min_f16 over the low and high
+; halves of v0-v3 and v8-v11. The GFX9 and GFX10 checks contain four
+; v_pk_min_f16 over the same registers and then shift the high halves of the
+; results into the odd result registers.
+; NOTE(review): the GFX8, GFX9 and GFX10 checks read only v0-v3 and v8-v11 as
+; inputs and cover eight 16-bit lanes of a sixteen-element vector; verify that
+; this matches the intended argument layout. They also use f16 instructions on
+; bfloat values.
+define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v16bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v17
+; GCN-NEXT:    v_min_f32_e32 v1, v1, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v18
+; GCN-NEXT:    v_min_f32_e32 v2, v2, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v19
+; GCN-NEXT:    v_min_f32_e32 v3, v3, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v20
+; GCN-NEXT:    v_min_f32_e32 v4, v4, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v21
+; GCN-NEXT:    v_min_f32_e32 v5, v5, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v22
+; GCN-NEXT:    v_min_f32_e32 v6, v6, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v23
+; GCN-NEXT:    v_min_f32_e32 v7, v7, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v24
+; GCN-NEXT:    v_min_f32_e32 v8, v8, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v25
+; GCN-NEXT:    v_min_f32_e32 v9, v9, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v26
+; GCN-NEXT:    v_min_f32_e32 v10, v10, v16
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT:    v_cvt_f32_f16_e32 v17, v27
+; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT:    v_cvt_f32_f16_e32 v18, v28
+; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT:    v_cvt_f32_f16_e32 v19, v29
+; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT:    v_cvt_f32_f16_e32 v20, v30
+; GCN-NEXT:    v_min_f32_e32 v11, v11, v17
+; GCN-NEXT:    v_min_f32_e32 v12, v12, v18
+; GCN-NEXT:    v_min_f32_e32 v13, v13, v19
+; GCN-NEXT:    v_min_f32_e32 v14, v14, v20
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT:    v_min_f32_e32 v15, v15, v16
+; GCN-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_v16bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v20
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_min_f32_e32 v1, v1, v16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v18
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v21
+; GFX7-NEXT:    v_min_f32_e32 v4, v4, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v19
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v22
+; GFX7-NEXT:    v_min_f32_e32 v5, v5, v18
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_min_f32_e32 v3, v3, v16
+; GFX7-NEXT:    buffer_load_dword v16, off, s[0:3], s32
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v23
+; GFX7-NEXT:    v_min_f32_e32 v6, v6, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v24
+; GFX7-NEXT:    v_min_f32_e32 v7, v7, v18
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v25
+; GFX7-NEXT:    v_min_f32_e32 v8, v8, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v26
+; GFX7-NEXT:    v_min_f32_e32 v9, v9, v18
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v27
+; GFX7-NEXT:    v_min_f32_e32 v10, v10, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v28
+; GFX7-NEXT:    v_min_f32_e32 v11, v11, v18
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v29
+; GFX7-NEXT:    v_min_f32_e32 v12, v12, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v30
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; GFX7-NEXT:    v_min_f32_e32 v13, v13, v18
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_min_f32_e32 v14, v14, v17
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT:    v_min_f32_e32 v15, v15, v16
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_v16bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f16_e32 v4, v0, v0
+; GFX8-NEXT:    v_max_f16_e32 v5, v8, v8
+; GFX8-NEXT:    v_min_f16_e32 v12, v4, v5
+; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v4, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_e32 v8, v0, v4
+; GFX8-NEXT:    v_max_f16_e32 v0, v1, v1
+; GFX8-NEXT:    v_max_f16_e32 v4, v9, v9
+; GFX8-NEXT:    v_min_f16_e32 v13, v0, v4
+; GFX8-NEXT:    v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_e32 v9, v0, v1
+; GFX8-NEXT:    v_max_f16_e32 v0, v2, v2
+; GFX8-NEXT:    v_max_f16_e32 v1, v10, v10
+; GFX8-NEXT:    v_min_f16_e32 v4, v0, v1
+; GFX8-NEXT:    v_max_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_e32 v5, v0, v1
+; GFX8-NEXT:    v_max_f16_e32 v0, v3, v3
+; GFX8-NEXT:    v_max_f16_e32 v1, v11, v11
+; GFX8-NEXT:    v_min_f16_e32 v6, v0, v1
+; GFX8-NEXT:    v_max_f16_sdwa v0, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_e32 v7, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, v12
+; GFX8-NEXT:    v_mov_b32_e32 v1, v8
+; GFX8-NEXT:    v_mov_b32_e32 v2, v13
+; GFX8-NEXT:    v_mov_b32_e32 v3, v9
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_v16bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v10
+; GFX9-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT:    v_pk_max_f16 v4, v8, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
+; GFX9-NEXT:    v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v4
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT:    v_pk_max_f16 v4, v9, v9
+; GFX9-NEXT:    v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_min_f16 v8, v1, v4
+; GFX9-NEXT:    v_pk_max_f16 v1, v2, v2
+; GFX9-NEXT:    v_pk_max_f16 v2, v10, v10
+; GFX9-NEXT:    v_pk_min_f16 v4, v1, v2
+; GFX9-NEXT:    v_pk_max_f16 v1, v3, v3
+; GFX9-NEXT:    v_pk_max_f16 v2, v11, v11
+; GFX9-NEXT:    v_pk_min_f16 v6, v1, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; GFX9-NEXT:    v_mov_b32_e32 v2, v8
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_v16bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT:    v_pk_max_f16 v4, v8, v8
+; GFX10-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT:    v_pk_max_f16 v5, v9, v9
+; GFX10-NEXT:    v_pk_max_f16 v6, v2, v2
+; GFX10-NEXT:    v_pk_max_f16 v7, v10, v10
+; GFX10-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX10-NEXT:    v_pk_max_f16 v8, v11, v11
+; GFX10-NEXT:    v_pk_min_f16 v0, v0, v4
+; GFX10-NEXT:    v_pk_min_f16 v2, v1, v5
+; GFX10-NEXT:    v_pk_min_f16 v4, v6, v7
+; GFX10-NEXT:    v_pk_min_f16 v6, v3, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
+  ret <16 x bfloat> %op
+}
+
+define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v32bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v32
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GCN-NEXT:    v_min_f32_e32 v1, v1, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:16
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_min_f32_e32 v2, v2, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GCN-NEXT:    v_min_f32_e32 v3, v3, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_min_f32_e32 v4, v4, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GCN-NEXT:    v_min_f32_e32 v5, v5, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:32
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_min_f32_e32 v6, v6, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GCN-NEXT:    v_min_f32_e32 v7, v7, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_min_f32_e32 v8, v8, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GCN-NEXT:    v_min_f32_e32 v9, v9, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:48
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_min_f32_e32 v10, v10, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GCN-NEXT:    v_min_f32_e32 v11, v11, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_min_f32_e32 v12, v12, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GCN-NEXT:    v_min_f32_e32 v13, v13, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:64
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_min_f32_e32 v14, v14, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GCN-NEXT:    v_min_f32_e32 v15, v15, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_min_f32_e32 v16, v16, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v17, v17
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GCN-NEXT:    v_min_f32_e32 v17, v17, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v18, v18
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_min_f32_e32 v18, v18, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v19, v19
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GCN-NEXT:    v_min_f32_e32 v19, v19, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v20, v20
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:88
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_min_f32_e32 v20, v20, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v21, v21
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GCN-NEXT:    v_min_f32_e32 v21, v21, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v22, v22
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:96
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_min_f32_e32 v22, v22, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v23, v23
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GCN-NEXT:    v_min_f32_e32 v23, v23, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v24, v24
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:104
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_min_f32_e32 v24, v24, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v25, v25
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GCN-NEXT:    v_min_f32_e32 v25, v25, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v26, v26
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:112
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_min_f32_e32 v26, v26, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GCN-NEXT:    v_min_f32_e32 v27, v27, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v28, v28
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_min_f32_e32 v28, v28, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v29, v29
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    v_min_f32_e32 v29, v29, v31
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:124
+; GCN-NEXT:    v_cvt_f32_f16_e32 v30, v30
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
+; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    v_min_f32_e32 v30, v30, v31
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v32
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v33
+; GCN-NEXT:    v_min_f32_e32 v31, v31, v32
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; GCN-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v19
+; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v20
+; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v21
+; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; GCN-NEXT:    v_cvt_f16_f32_e32 v23, v23
+; GCN-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; GCN-NEXT:    v_cvt_f16_f32_e32 v25, v25
+; GCN-NEXT:    v_cvt_f16_f32_e32 v26, v26
+; GCN-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v28
+; GCN-NEXT:    v_cvt_f16_f32_e32 v29, v29
+; GCN-NEXT:    v_cvt_f16_f32_e32 v30, v30
+; GCN-NEXT:    v_cvt_f16_f32_e32 v31, v31
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_v32bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v18
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v19, v19
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v20, v20
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v21, v21
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v22, v22
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v23, v23
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v24, v24
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v25, v25
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v26, v26
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v28, v28
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v29, v29
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v30, v30
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v1, v1, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:12
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v2, v2, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:16
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v3, v3, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:20
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v4, v4, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v5, v5, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v6, v6, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:32
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v7, v7, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:36
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v8, v8, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v9, v9, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:44
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v10, v10, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:48
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v11, v11, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:52
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v12, v12, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v13, v13, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:60
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v14, v14, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:64
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v15, v15, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v16, v16, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v17, v17, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:76
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v18, v18, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v19, v19, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:84
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v19, v19
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v20, v20, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:88
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v20, v20
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v21, v21, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v21, v21
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v22, v22, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:96
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v23, v23, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:100
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v23, v23
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v24, v24, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:104
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v25, v25, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:108
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v25, v25
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v26, v26, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:112
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v26, v26
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v27, v27, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v28, v28, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v28, v28
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v29, v29, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:124
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v29, v29
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v30, v30, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v30, v30
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_min_f32_e32 v31, v31, v32
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_v32bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f16_e32 v8, v0, v0
+; GFX8-NEXT:    v_max_f16_e32 v9, v16, v16
+; GFX8-NEXT:    v_min_f16_e32 v24, v8, v9
+; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v8, v16, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_e32 v16, v0, v8
+; GFX8-NEXT:    v_max_f16_e32 v0, v1, v1
+; GFX8-NEXT:    v_max_f16_e32 v8, v17, v17
+; GFX8-NEXT:    v_min_f16_e32 v25, v0, v8
+; GFX8-NEXT:    v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v17, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_e32 v17, v0, v1
+; GFX8-NEXT:    v_max_f16_e32 v0, v2, v2
+; GFX8-NEXT:    v_max_f16_e32 v1, v18, v18
+; GFX8-NEXT:    v_min_f16_e32 v26, v0, v1
+; GFX8-NEXT:    v_max_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v18, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_e32 v18, v0, v1
+; GFX8-NEXT:    v_max_f16_e32 v0, v3, v3
+; GFX8-NEXT:    v_max_f16_e32 v1, v19, v19
+; GFX8-NEXT:    v_min_f16_e32 v27, v0, v1
+; GFX8-NEXT:    v_max_f16_sdwa v0, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v19, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_e32 v19, v0, v1
+; GFX8-NEXT:    v_max_f16_e32 v0, v4, v4
+; GFX8-NEXT:    v_max_f16_e32 v1, v20, v20
+; GFX8-NEXT:    v_min_f16_e32 v8, v0, v1
+; GFX8-NEXT:    v_max_f16_sdwa v0, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v20, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_e32 v9, v0, v1
+; GFX8-NEXT:    v_max_f16_e32 v0, v5, v5
+; GFX8-NEXT:    v_max_f16_e32 v1, v21, v21
+; GFX8-NEXT:    v_min_f16_e32 v10, v0, v1
+; GFX8-NEXT:    v_max_f16_sdwa v0, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v21, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_e32 v11, v0, v1
+; GFX8-NEXT:    v_max_f16_e32 v0, v6, v6
+; GFX8-NEXT:    v_max_f16_e32 v1, v22, v22
+; GFX8-NEXT:    v_min_f16_e32 v12, v0, v1
+; GFX8-NEXT:    v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v22, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_e32 v13, v0, v1
+; GFX8-NEXT:    v_max_f16_e32 v0, v7, v7
+; GFX8-NEXT:    v_max_f16_e32 v1, v23, v23
+; GFX8-NEXT:    v_min_f16_e32 v14, v0, v1
+; GFX8-NEXT:    v_max_f16_sdwa v0, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v23, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_min_f16_e32 v15, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, v24
+; GFX8-NEXT:    v_mov_b32_e32 v1, v16
+; GFX8-NEXT:    v_mov_b32_e32 v2, v25
+; GFX8-NEXT:    v_mov_b32_e32 v3, v17
+; GFX8-NEXT:    v_mov_b32_e32 v4, v26
+; GFX8-NEXT:    v_mov_b32_e32 v5, v18
+; GFX8-NEXT:    v_mov_b32_e32 v6, v27
+; GFX8-NEXT:    v_mov_b32_e32 v7, v19
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_v32bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v24, 16, v16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 16, v17
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v20
+; GFX9-NEXT:    v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
+; GFX9-NEXT:    v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v20, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT:    v_pk_max_f16 v8, v16, v16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 16, v19
+; GFX9-NEXT:    v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v8
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT:    v_pk_max_f16 v8, v17, v17
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GFX9-NEXT:    v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_min_f16 v16, v1, v8
+; GFX9-NEXT:    v_pk_max_f16 v1, v2, v2
+; GFX9-NEXT:    v_pk_max_f16 v2, v18, v18
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v21
+; GFX9-NEXT:    v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_min_f16 v18, v1, v2
+; GFX9-NEXT:    v_pk_max_f16 v1, v3, v3
+; GFX9-NEXT:    v_pk_max_f16 v2, v19, v19
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v22
+; GFX9-NEXT:    v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v21, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_min_f16 v17, v1, v2
+; GFX9-NEXT:    v_pk_max_f16 v1, v4, v4
+; GFX9-NEXT:    v_pk_max_f16 v2, v20, v20
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v23
+; GFX9-NEXT:    v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v22, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_min_f16 v8, v1, v2
+; GFX9-NEXT:    v_pk_max_f16 v1, v5, v5
+; GFX9-NEXT:    v_pk_max_f16 v2, v21, v21
+; GFX9-NEXT:    v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v23, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_min_f16 v10, v1, v2
+; GFX9-NEXT:    v_pk_max_f16 v1, v6, v6
+; GFX9-NEXT:    v_pk_max_f16 v2, v22, v22
+; GFX9-NEXT:    v_pk_min_f16 v12, v1, v2
+; GFX9-NEXT:    v_pk_max_f16 v1, v7, v7
+; GFX9-NEXT:    v_pk_max_f16 v2, v23, v23
+; GFX9-NEXT:    v_pk_min_f16 v14, v1, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v18
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v17
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
+; GFX9-NEXT:    v_mov_b32_e32 v2, v16
+; GFX9-NEXT:    v_mov_b32_e32 v4, v18
+; GFX9-NEXT:    v_mov_b32_e32 v6, v17
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_v32bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v17
+; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v27, 16, v19
+; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v20
+; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v21
+; GFX10-NEXT:    v_lshrrev_b32_e32 v30, 16, v22
+; GFX10-NEXT:    v_lshrrev_b32_e32 v31, 16, v23
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v20, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v21, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT:    v_pk_max_f16 v8, v16, v16
+; GFX10-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT:    v_pk_max_f16 v9, v17, v17
+; GFX10-NEXT:    v_pk_max_f16 v10, v2, v2
+; GFX10-NEXT:    v_pk_max_f16 v11, v18, v18
+; GFX10-NEXT:    v_mov_b32_sdwa v22, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v23, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_pk_min_f16 v0, v0, v8
+; GFX10-NEXT:    v_pk_min_f16 v2, v1, v9
+; GFX10-NEXT:    v_pk_min_f16 v16, v10, v11
+; GFX10-NEXT:    v_pk_max_f16 v1, v3, v3
+; GFX10-NEXT:    v_pk_max_f16 v3, v19, v19
+; GFX10-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX10-NEXT:    v_pk_max_f16 v8, v20, v20
+; GFX10-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX10-NEXT:    v_pk_max_f16 v9, v21, v21
+; GFX10-NEXT:    v_pk_max_f16 v11, v6, v6
+; GFX10-NEXT:    v_pk_max_f16 v12, v22, v22
+; GFX10-NEXT:    v_pk_max_f16 v7, v7, v7
+; GFX10-NEXT:    v_pk_max_f16 v13, v23, v23
+; GFX10-NEXT:    v_pk_min_f16 v6, v1, v3
+; GFX10-NEXT:    v_pk_min_f16 v8, v4, v8
+; GFX10-NEXT:    v_pk_min_f16 v10, v5, v9
+; GFX10-NEXT:    v_pk_min_f16 v12, v11, v12
+; GFX10-NEXT:    v_pk_min_f16 v14, v7, v13
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
+; GFX10-NEXT:    v_mov_b32_e32 v4, v16
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
+  ret <32 x bfloat> %op
+}
+
+
+declare bfloat @llvm.maxnum.bf16(bfloat, bfloat)
+declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
+declare <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat>, <3 x bfloat>)
+declare <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
+declare <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat>, <8 x bfloat>)
+declare <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat>, <16 x bfloat>)
+declare <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat>, <32 x bfloat>)
+
+define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_maxnum_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_maxnum_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maxnum_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maxnum_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX10-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX10-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b)
+  ret bfloat %op
+}
+
+define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_max_f32_e32 v0, v0, v2
+; GCN-NEXT:    v_max_f32_e32 v1, v1, v3
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_maxnum_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maxnum_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f16_e32 v2, v0, v0
+; GFX8-NEXT:    v_max_f16_e32 v3, v1, v1
+; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v2, v2, v3
+; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maxnum_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
+  ret <2 x bfloat> %op
+}
+
+define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v3bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    v_max_f32_e32 v0, v0, v3
+; GCN-NEXT:    v_max_f32_e32 v1, v1, v4
+; GCN-NEXT:    v_max_f32_e32 v2, v2, v5
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_maxnum_v3bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maxnum_v3bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f16_e32 v1, v0, v0
+; GFX8-NEXT:    v_max_f16_e32 v3, v2, v2
+; GFX8-NEXT:    v_max_f16_e32 v3, v1, v3
+; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v1, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maxnum_v3bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v0
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v2, v2
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_v3bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
+; GFX10-NEXT:    v_bfi_b32 v1, 0xffff, v2, v2
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
+  ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT:    v_max_f32_e32 v0, v0, v4
+; GCN-NEXT:    v_max_f32_e32 v1, v1, v5
+; GCN-NEXT:    v_max_f32_e32 v2, v2, v6
+; GCN-NEXT:    v_max_f32_e32 v3, v3, v7
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_maxnum_v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v6
+; GFX7-NEXT:    v_max_f32_e32 v1, v1, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v7
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_max_f32_e32 v3, v3, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maxnum_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f16_e32 v1, v0, v0
+; GFX8-NEXT:    v_max_f16_e32 v3, v2, v2
+; GFX8-NEXT:    v_max_f16_e32 v3, v1, v3
+; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v1, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maxnum_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT:    v_pk_max_f16 v1, v2, v2
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT:    v_pk_max_f16 v1, v2, v2
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
+  ret <4 x bfloat> %op
+}
+
+define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v8bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT:    v_max_f32_e32 v0, v0, v8
+; GCN-NEXT:    v_max_f32_e32 v1, v1, v9
+; GCN-NEXT:    v_max_f32_e32 v2, v2, v10
+; GCN-NEXT:    v_max_f32_e32 v3, v3, v11
+; GCN-NEXT:    v_max_f32_e32 v4, v4, v12
+; GCN-NEXT:    v_max_f32_e32 v5, v5, v13
+; GCN-NEXT:    v_max_f32_e32 v6, v6, v14
+; GCN-NEXT:    v_max_f32_e32 v7, v7, v15
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_maxnum_v8bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v10
+; GFX7-NEXT:    v_max_f32_e32 v1, v1, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v11
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v12
+; GFX7-NEXT:    v_max_f32_e32 v3, v3, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v13
+; GFX7-NEXT:    v_max_f32_e32 v4, v4, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v14
+; GFX7-NEXT:    v_max_f32_e32 v5, v5, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v15
+; GFX7-NEXT:    v_max_f32_e32 v6, v6, v8
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_max_f32_e32 v7, v7, v9
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maxnum_v8bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f16_e32 v2, v0, v0
+; GFX8-NEXT:    v_max_f16_e32 v3, v4, v4
+; GFX8-NEXT:    v_max_f16_e32 v6, v2, v3
+; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v4, v0, v2
+; GFX8-NEXT:    v_max_f16_e32 v0, v1, v1
+; GFX8-NEXT:    v_max_f16_e32 v2, v5, v5
+; GFX8-NEXT:    v_max_f16_e32 v2, v0, v2
+; GFX8-NEXT:    v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v3, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, v6
+; GFX8-NEXT:    v_mov_b32_e32 v1, v4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maxnum_v8bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT:    v_pk_max_f16 v2, v4, v4
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT:    v_pk_max_f16 v2, v5, v5
+; GFX9-NEXT:    v_pk_max_f16 v2, v1, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_v8bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT:    v_pk_max_f16 v2, v4, v4
+; GFX10-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT:    v_pk_max_f16 v3, v5, v5
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX10-NEXT:    v_pk_max_f16 v2, v1, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
+  ret <8 x bfloat> %op
+}
+
+define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v16bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT:    v_max_f32_e32 v0, v0, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v17
+; GCN-NEXT:    v_max_f32_e32 v1, v1, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v18
+; GCN-NEXT:    v_max_f32_e32 v2, v2, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v19
+; GCN-NEXT:    v_max_f32_e32 v3, v3, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v20
+; GCN-NEXT:    v_max_f32_e32 v4, v4, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v21
+; GCN-NEXT:    v_max_f32_e32 v5, v5, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v22
+; GCN-NEXT:    v_max_f32_e32 v6, v6, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v23
+; GCN-NEXT:    v_max_f32_e32 v7, v7, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v24
+; GCN-NEXT:    v_max_f32_e32 v8, v8, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v25
+; GCN-NEXT:    v_max_f32_e32 v9, v9, v16
+; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v26
+; GCN-NEXT:    v_max_f32_e32 v10, v10, v16
+; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT:    v_cvt_f32_f16_e32 v17, v27
+; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT:    v_cvt_f32_f16_e32 v18, v28
+; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT:    v_cvt_f32_f16_e32 v19, v29
+; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT:    v_cvt_f32_f16_e32 v20, v30
+; GCN-NEXT:    v_max_f32_e32 v11, v11, v17
+; GCN-NEXT:    v_max_f32_e32 v12, v12, v18
+; GCN-NEXT:    v_max_f32_e32 v13, v13, v19
+; GCN-NEXT:    v_max_f32_e32 v14, v14, v20
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT:    v_max_f32_e32 v15, v15, v16
+; GCN-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_maxnum_v16bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v20
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_max_f32_e32 v1, v1, v16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v18
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v21
+; GFX7-NEXT:    v_max_f32_e32 v4, v4, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v19
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v22
+; GFX7-NEXT:    v_max_f32_e32 v5, v5, v18
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_max_f32_e32 v3, v3, v16
+; GFX7-NEXT:    buffer_load_dword v16, off, s[0:3], s32
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v23
+; GFX7-NEXT:    v_max_f32_e32 v6, v6, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v24
+; GFX7-NEXT:    v_max_f32_e32 v7, v7, v18
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v25
+; GFX7-NEXT:    v_max_f32_e32 v8, v8, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v26
+; GFX7-NEXT:    v_max_f32_e32 v9, v9, v18
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v27
+; GFX7-NEXT:    v_max_f32_e32 v10, v10, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v28
+; GFX7-NEXT:    v_max_f32_e32 v11, v11, v18
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v29
+; GFX7-NEXT:    v_max_f32_e32 v12, v12, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v30
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; GFX7-NEXT:    v_max_f32_e32 v13, v13, v18
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_max_f32_e32 v14, v14, v17
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT:    v_max_f32_e32 v15, v15, v16
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maxnum_v16bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f16_e32 v4, v0, v0
+; GFX8-NEXT:    v_max_f16_e32 v5, v8, v8
+; GFX8-NEXT:    v_max_f16_e32 v12, v4, v5
+; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v4, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v8, v0, v4
+; GFX8-NEXT:    v_max_f16_e32 v0, v1, v1
+; GFX8-NEXT:    v_max_f16_e32 v4, v9, v9
+; GFX8-NEXT:    v_max_f16_e32 v13, v0, v4
+; GFX8-NEXT:    v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v9, v0, v1
+; GFX8-NEXT:    v_max_f16_e32 v0, v2, v2
+; GFX8-NEXT:    v_max_f16_e32 v1, v10, v10
+; GFX8-NEXT:    v_max_f16_e32 v4, v0, v1
+; GFX8-NEXT:    v_max_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v5, v0, v1
+; GFX8-NEXT:    v_max_f16_e32 v0, v3, v3
+; GFX8-NEXT:    v_max_f16_e32 v1, v11, v11
+; GFX8-NEXT:    v_max_f16_e32 v6, v0, v1
+; GFX8-NEXT:    v_max_f16_sdwa v0, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v7, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, v12
+; GFX8-NEXT:    v_mov_b32_e32 v1, v8
+; GFX8-NEXT:    v_mov_b32_e32 v2, v13
+; GFX8-NEXT:    v_mov_b32_e32 v3, v9
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maxnum_v16bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v10
+; GFX9-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT:    v_pk_max_f16 v4, v8, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
+; GFX9-NEXT:    v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v4
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT:    v_pk_max_f16 v4, v9, v9
+; GFX9-NEXT:    v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_max_f16 v8, v1, v4
+; GFX9-NEXT:    v_pk_max_f16 v1, v2, v2
+; GFX9-NEXT:    v_pk_max_f16 v2, v10, v10
+; GFX9-NEXT:    v_pk_max_f16 v4, v1, v2
+; GFX9-NEXT:    v_pk_max_f16 v1, v3, v3
+; GFX9-NEXT:    v_pk_max_f16 v2, v11, v11
+; GFX9-NEXT:    v_pk_max_f16 v6, v1, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; GFX9-NEXT:    v_mov_b32_e32 v2, v8
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_v16bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT:    v_pk_max_f16 v4, v8, v8
+; GFX10-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT:    v_pk_max_f16 v5, v9, v9
+; GFX10-NEXT:    v_pk_max_f16 v6, v2, v2
+; GFX10-NEXT:    v_pk_max_f16 v7, v10, v10
+; GFX10-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX10-NEXT:    v_pk_max_f16 v8, v11, v11
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v4
+; GFX10-NEXT:    v_pk_max_f16 v2, v1, v5
+; GFX10-NEXT:    v_pk_max_f16 v4, v6, v7
+; GFX10-NEXT:    v_pk_max_f16 v6, v3, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
+  ret <16 x bfloat> %op
+}
+
+define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v32bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    v_max_f32_e32 v0, v0, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v32
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GCN-NEXT:    v_max_f32_e32 v1, v1, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:16
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_max_f32_e32 v2, v2, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GCN-NEXT:    v_max_f32_e32 v3, v3, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_max_f32_e32 v4, v4, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GCN-NEXT:    v_max_f32_e32 v5, v5, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:32
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_max_f32_e32 v6, v6, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GCN-NEXT:    v_max_f32_e32 v7, v7, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_max_f32_e32 v8, v8, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GCN-NEXT:    v_max_f32_e32 v9, v9, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:48
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_max_f32_e32 v10, v10, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GCN-NEXT:    v_max_f32_e32 v11, v11, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_max_f32_e32 v12, v12, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GCN-NEXT:    v_max_f32_e32 v13, v13, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:64
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_max_f32_e32 v14, v14, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GCN-NEXT:    v_max_f32_e32 v15, v15, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_max_f32_e32 v16, v16, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v17, v17
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GCN-NEXT:    v_max_f32_e32 v17, v17, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v18, v18
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_max_f32_e32 v18, v18, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v19, v19
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GCN-NEXT:    v_max_f32_e32 v19, v19, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v20, v20
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:88
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_max_f32_e32 v20, v20, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v21, v21
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GCN-NEXT:    v_max_f32_e32 v21, v21, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v22, v22
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:96
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_max_f32_e32 v22, v22, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v23, v23
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GCN-NEXT:    v_max_f32_e32 v23, v23, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v24, v24
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:104
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_max_f32_e32 v24, v24, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v25, v25
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GCN-NEXT:    v_max_f32_e32 v25, v25, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v26, v26
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:112
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_max_f32_e32 v26, v26, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GCN-NEXT:    v_max_f32_e32 v27, v27, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v28, v28
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT:    v_max_f32_e32 v28, v28, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v29, v29
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    v_max_f32_e32 v29, v29, v31
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:124
+; GCN-NEXT:    v_cvt_f32_f16_e32 v30, v30
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
+; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT:    v_max_f32_e32 v30, v30, v31
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v32
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v33
+; GCN-NEXT:    v_max_f32_e32 v31, v31, v32
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; GCN-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v19
+; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v20
+; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v21
+; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; GCN-NEXT:    v_cvt_f16_f32_e32 v23, v23
+; GCN-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; GCN-NEXT:    v_cvt_f16_f32_e32 v25, v25
+; GCN-NEXT:    v_cvt_f16_f32_e32 v26, v26
+; GCN-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v28
+; GCN-NEXT:    v_cvt_f16_f32_e32 v29, v29
+; GCN-NEXT:    v_cvt_f16_f32_e32 v30, v30
+; GCN-NEXT:    v_cvt_f16_f32_e32 v31, v31
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_maxnum_v32bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v18
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v19, v19
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v20, v20
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v21, v21
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v22, v22
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v23, v23
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v24, v24
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v25, v25
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v26, v26
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v28, v28
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v29, v29
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v30, v30
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v1, v1, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:12
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v2, v2, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:16
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v3, v3, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:20
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v4, v4, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v5, v5, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v6, v6, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:32
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v7, v7, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:36
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v8, v8, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v9, v9, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:44
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v10, v10, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:48
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v11, v11, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:52
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v12, v12, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v13, v13, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:60
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v14, v14, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:64
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v15, v15, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v16, v16, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v17, v17, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:76
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v18, v18, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v19, v19, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:84
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v19, v19
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v20, v20, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:88
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v20, v20
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v21, v21, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v21, v21
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v22, v22, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:96
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v23, v23, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:100
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v23, v23
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v24, v24, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:104
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v25, v25, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:108
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v25, v25
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v26, v26, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:112
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v26, v26
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v27, v27, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v28, v28, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v28, v28
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v29, v29, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:124
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v29, v29
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v30, v30, v31
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v30, v30
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT:    v_max_f32_e32 v31, v31, v32
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maxnum_v32bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f16_e32 v8, v0, v0
+; GFX8-NEXT:    v_max_f16_e32 v9, v16, v16
+; GFX8-NEXT:    v_max_f16_e32 v24, v8, v9
+; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v8, v16, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v16, v0, v8
+; GFX8-NEXT:    v_max_f16_e32 v0, v1, v1
+; GFX8-NEXT:    v_max_f16_e32 v8, v17, v17
+; GFX8-NEXT:    v_max_f16_e32 v25, v0, v8
+; GFX8-NEXT:    v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v17, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v17, v0, v1
+; GFX8-NEXT:    v_max_f16_e32 v0, v2, v2
+; GFX8-NEXT:    v_max_f16_e32 v1, v18, v18
+; GFX8-NEXT:    v_max_f16_e32 v26, v0, v1
+; GFX8-NEXT:    v_max_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v18, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v18, v0, v1
+; GFX8-NEXT:    v_max_f16_e32 v0, v3, v3
+; GFX8-NEXT:    v_max_f16_e32 v1, v19, v19
+; GFX8-NEXT:    v_max_f16_e32 v27, v0, v1
+; GFX8-NEXT:    v_max_f16_sdwa v0, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v19, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v19, v0, v1
+; GFX8-NEXT:    v_max_f16_e32 v0, v4, v4
+; GFX8-NEXT:    v_max_f16_e32 v1, v20, v20
+; GFX8-NEXT:    v_max_f16_e32 v8, v0, v1
+; GFX8-NEXT:    v_max_f16_sdwa v0, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v20, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v9, v0, v1
+; GFX8-NEXT:    v_max_f16_e32 v0, v5, v5
+; GFX8-NEXT:    v_max_f16_e32 v1, v21, v21
+; GFX8-NEXT:    v_max_f16_e32 v10, v0, v1
+; GFX8-NEXT:    v_max_f16_sdwa v0, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v21, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v11, v0, v1
+; GFX8-NEXT:    v_max_f16_e32 v0, v6, v6
+; GFX8-NEXT:    v_max_f16_e32 v1, v22, v22
+; GFX8-NEXT:    v_max_f16_e32 v12, v0, v1
+; GFX8-NEXT:    v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v22, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v13, v0, v1
+; GFX8-NEXT:    v_max_f16_e32 v0, v7, v7
+; GFX8-NEXT:    v_max_f16_e32 v1, v23, v23
+; GFX8-NEXT:    v_max_f16_e32 v14, v0, v1
+; GFX8-NEXT:    v_max_f16_sdwa v0, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_sdwa v1, v23, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_max_f16_e32 v15, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, v24
+; GFX8-NEXT:    v_mov_b32_e32 v1, v16
+; GFX8-NEXT:    v_mov_b32_e32 v2, v25
+; GFX8-NEXT:    v_mov_b32_e32 v3, v17
+; GFX8-NEXT:    v_mov_b32_e32 v4, v26
+; GFX8-NEXT:    v_mov_b32_e32 v5, v18
+; GFX8-NEXT:    v_mov_b32_e32 v6, v27
+; GFX8-NEXT:    v_mov_b32_e32 v7, v19
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maxnum_v32bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v24, 16, v16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 16, v17
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v20
+; GFX9-NEXT:    v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
+; GFX9-NEXT:    v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v20, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT:    v_pk_max_f16 v8, v16, v16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 16, v19
+; GFX9-NEXT:    v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v8
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT:    v_pk_max_f16 v8, v17, v17
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GFX9-NEXT:    v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_max_f16 v16, v1, v8
+; GFX9-NEXT:    v_pk_max_f16 v1, v2, v2
+; GFX9-NEXT:    v_pk_max_f16 v2, v18, v18
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v21
+; GFX9-NEXT:    v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_max_f16 v18, v1, v2
+; GFX9-NEXT:    v_pk_max_f16 v1, v3, v3
+; GFX9-NEXT:    v_pk_max_f16 v2, v19, v19
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v22
+; GFX9-NEXT:    v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v21, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_max_f16 v17, v1, v2
+; GFX9-NEXT:    v_pk_max_f16 v1, v4, v4
+; GFX9-NEXT:    v_pk_max_f16 v2, v20, v20
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v23
+; GFX9-NEXT:    v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v22, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_max_f16 v8, v1, v2
+; GFX9-NEXT:    v_pk_max_f16 v1, v5, v5
+; GFX9-NEXT:    v_pk_max_f16 v2, v21, v21
+; GFX9-NEXT:    v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v23, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_max_f16 v10, v1, v2
+; GFX9-NEXT:    v_pk_max_f16 v1, v6, v6
+; GFX9-NEXT:    v_pk_max_f16 v2, v22, v22
+; GFX9-NEXT:    v_pk_max_f16 v12, v1, v2
+; GFX9-NEXT:    v_pk_max_f16 v1, v7, v7
+; GFX9-NEXT:    v_pk_max_f16 v2, v23, v23
+; GFX9-NEXT:    v_pk_max_f16 v14, v1, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v18
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v17
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
+; GFX9-NEXT:    v_mov_b32_e32 v2, v16
+; GFX9-NEXT:    v_mov_b32_e32 v4, v18
+; GFX9-NEXT:    v_mov_b32_e32 v6, v17
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_v32bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v17
+; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v27, 16, v19
+; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v20
+; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v21
+; GFX10-NEXT:    v_lshrrev_b32_e32 v30, 16, v22
+; GFX10-NEXT:    v_lshrrev_b32_e32 v31, 16, v23
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v20, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v21, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT:    v_pk_max_f16 v8, v16, v16
+; GFX10-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT:    v_pk_max_f16 v9, v17, v17
+; GFX10-NEXT:    v_pk_max_f16 v10, v2, v2
+; GFX10-NEXT:    v_pk_max_f16 v11, v18, v18
+; GFX10-NEXT:    v_mov_b32_sdwa v22, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v23, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v8
+; GFX10-NEXT:    v_pk_max_f16 v2, v1, v9
+; GFX10-NEXT:    v_pk_max_f16 v16, v10, v11
+; GFX10-NEXT:    v_pk_max_f16 v1, v3, v3
+; GFX10-NEXT:    v_pk_max_f16 v3, v19, v19
+; GFX10-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX10-NEXT:    v_pk_max_f16 v8, v20, v20
+; GFX10-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX10-NEXT:    v_pk_max_f16 v9, v21, v21
+; GFX10-NEXT:    v_pk_max_f16 v11, v6, v6
+; GFX10-NEXT:    v_pk_max_f16 v12, v22, v22
+; GFX10-NEXT:    v_pk_max_f16 v7, v7, v7
+; GFX10-NEXT:    v_pk_max_f16 v13, v23, v23
+; GFX10-NEXT:    v_pk_max_f16 v6, v1, v3
+; GFX10-NEXT:    v_pk_max_f16 v8, v4, v8
+; GFX10-NEXT:    v_pk_max_f16 v10, v5, v9
+; GFX10-NEXT:    v_pk_max_f16 v12, v11, v12
+; GFX10-NEXT:    v_pk_max_f16 v14, v7, v13
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
+; GFX10-NEXT:    v_mov_b32_e32 v4, v16
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
+  ret <32 x bfloat> %op
+}
+
+declare bfloat @llvm.sqrt.bf16(bfloat)
+
+define bfloat @v_sqrt_bf16(bfloat %a) {
+; GCN-LABEL: v_sqrt_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_sqrt_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_sqrt_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_sqrt_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_sqrt_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sqrt_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.sqrt.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+declare bfloat @llvm.ldexp.bf16.i32(bfloat, i32)
+
+define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
+; GCN-LABEL: v_ldexp_bf16_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_ldexp_bf16_i32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_ldexp_bf16_i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX8-NEXT:    v_med3_i32 v1, v1, v2, v3
+; GFX8-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_ldexp_bf16_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX9-NEXT:    v_med3_i32 v1, v1, v2, v3
+; GFX9-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_ldexp_bf16_i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX10-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.ldexp.bf16.i32(bfloat %a, i32 %b)
+  ret bfloat %op
+}
+
+declare { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat)
+
+define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
+; GCN-LABEL: v_frexp_bf16_i16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x7f800000
+; GCN-NEXT:    v_frexp_mant_f32_e32 v2, v0
+; GCN-NEXT:    v_frexp_exp_i32_f32_e32 v3, v0
+; GCN-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_frexp_bf16_i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX7-NEXT:    v_frexp_mant_f32_e32 v0, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_frexp_bf16_i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_frexp_mant_f16_e32 v2, v0
+; GFX8-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, v2
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_frexp_bf16_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_frexp_mant_f16_e32 v2, v0
+; GFX9-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_frexp_bf16_i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_frexp_mant_f16_e32 v2, v0
+; GFX10-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat %a)
+  ret { bfloat, i16 } %op
+}
+
+
+declare bfloat @llvm.log.bf16(bfloat)
+declare bfloat @llvm.log2.bf16(bfloat)
+declare bfloat @llvm.log10.bf16(bfloat)
+
+define bfloat @v_log_bf16(bfloat %a) {
+; GCN-LABEL: v_log_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_log_f32_e32 v0, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_log_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_log_f32_e32 v0, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_log_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_log_f16_e32 v0, v0
+; GFX8-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_log_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_log_f16_e32 v0, v0
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_log_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_log_f16_e32 v0, v0
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.log.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+define bfloat @v_log2_bf16(bfloat %a) {
+; GCN-LABEL: v_log2_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_log_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_log2_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_log_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_log2_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_log_f16_e32 v0, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_log2_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_log_f16_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_log2_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_log_f16_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.log2.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+define bfloat @v_log10_bf16(bfloat %a) {
+; GCN-LABEL: v_log10_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_log_f32_e32 v0, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_log10_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_log_f32_e32 v0, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_log10_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_log_f16_e32 v0, v0
+; GFX8-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_log10_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_log_f16_e32 v0, v0
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_log10_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_log_f16_e32 v0, v0
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.log10.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+declare bfloat @llvm.exp.bf16(bfloat)
+declare bfloat @llvm.exp2.bf16(bfloat)
+declare bfloat @llvm.exp10.bf16(bfloat)
+
+define bfloat @v_exp_bf16(bfloat %a) {
+; GCN-LABEL: v_exp_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GCN-NEXT:    v_exp_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_exp_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX7-NEXT:    v_exp_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_exp_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT:    v_exp_f32_e32 v0, v0
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_exp_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX9-NEXT:    v_exp_f32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_exp_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT:    v_exp_f32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.exp.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+define bfloat @v_exp2_bf16(bfloat %a) {
+; GCN-LABEL: v_exp2_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_exp_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_exp2_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_exp_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_exp2_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_exp_f16_e32 v0, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_exp2_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_exp_f16_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_exp2_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_exp_f16_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.exp2.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+define bfloat @v_exp10_bf16(bfloat %a) {
+; GCN-LABEL: v_exp10_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GCN-NEXT:    v_exp_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_exp10_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX7-NEXT:    v_exp_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_exp10_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT:    v_exp_f32_e32 v0, v0
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_exp10_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX9-NEXT:    v_exp_f32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_exp10_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT:    v_exp_f32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.exp10.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+declare bfloat @llvm.ceil.bf16(bfloat)
+
+define bfloat @v_ceil_bf16(bfloat %a) {
+; GCN-LABEL: v_ceil_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_ceil_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_ceil_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_ceil_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_ceil_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_ceil_f16_e32 v0, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_ceil_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_ceil_f16_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_ceil_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_ceil_f16_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.ceil.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+declare bfloat @llvm.trunc.bf16(bfloat)
+
+define bfloat @v_trunc_bf16(bfloat %a) {
+; GCN-LABEL: v_trunc_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_trunc_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_trunc_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_trunc_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_trunc_f16_e32 v0, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_trunc_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f16_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_trunc_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f16_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.trunc.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+declare bfloat @llvm.rint.bf16(bfloat)
+
+define bfloat @v_rint_bf16(bfloat %a) {
+; GCN-LABEL: v_rint_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_rndne_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_rint_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_rint_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_rint_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_rint_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.rint.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+declare bfloat @llvm.nearbyint.bf16(bfloat)
+
+; FIXME: unable to legalize instruction: %2:_(s16) = G_FNEARBYINT %0:_
+; define bfloat @v_nearbyint_bf16(bfloat %a) {
+;   %op = call bfloat @llvm.nearbyint.bf16(bfloat %a)
+;   ret bfloat %op
+; }
+
+declare bfloat @llvm.round.bf16(bfloat)
+
+define bfloat @v_round_bf16(bfloat %a) {
+; GCN-LABEL: v_round_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, 0.5
+; GCN-NEXT:    v_mov_b32_e32 v3, 0x3c00
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GCN-NEXT:    v_trunc_f32_e32 v4, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT:    v_cvt_f32_f16_e64 v5, -v4
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v5
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
+; GCN-NEXT:    v_cmp_ge_f32_e32 vcc, v1, v2
+; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_add_f32_e32 v0, v1, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_round_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v4, 0x3c00
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX7-NEXT:    v_trunc_f32_e32 v2, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e64 v3, -v2
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, 0.5
+; GFX7-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
+; GFX7-NEXT:    v_cmp_ge_f32_e32 vcc, v1, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_round_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX8-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x3c00
+; GFX8-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_round_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX9-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3c00
+; GFX9-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX9-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_round_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX10-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX10-NEXT:    v_cmp_ge_f16_e64 s4, |v2|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s4
+; GFX10-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX10-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.round.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+declare bfloat @llvm.roundeven.bf16(bfloat)
+
+define bfloat @v_roundeven_bf16(bfloat %a) {
+; GCN-LABEL: v_roundeven_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_rndne_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_roundeven_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.roundeven.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+declare bfloat @llvm.floor.bf16(bfloat)
+
+define bfloat @v_floor_bf16(bfloat %a) {
+; GCN-LABEL: v_floor_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_floor_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_floor_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_floor_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_floor_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_floor_f16_e32 v0, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_floor_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_floor_f16_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_floor_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_floor_f16_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.floor.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+declare bfloat @llvm.canonicalize.bf16(bfloat)
+
+define bfloat @v_canonicalize_bf16(bfloat %a) {
+; GCN-LABEL: v_canonicalize_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_canonicalize_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_canonicalize_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_canonicalize_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_canonicalize_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.canonicalize.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+declare bfloat @llvm.arithmetic.fence.bf16(bfloat)
+
+; FIXME: Promotion broken
+; define bfloat @v_arithmetic_fence_bf16(bfloat %a) {
+;   %op = call bfloat @llvm.arithmetic.fence.bf16(bfloat %a)
+;   ret bfloat %op
+; }
+
+define i1 @v_fcmp_false_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_false_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_false_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_false_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_false_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_false_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp false bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_oeq_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cmp_eq_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_oeq_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_oeq_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_eq_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_oeq_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_oeq_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp oeq bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_ogt_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_ogt_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_ogt_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_gt_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_ogt_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_ogt_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp ogt bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_oge_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_oge_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_oge_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_ge_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_oge_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_oge_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp oge bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_olt_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_olt_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_olt_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_olt_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_olt_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp olt bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_ole_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cmp_le_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_ole_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cmp_le_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_ole_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_le_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_ole_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_le_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_ole_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_le_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp ole bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_one_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cmp_lg_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_one_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cmp_lg_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_one_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_lg_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_one_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lg_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_one_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp one bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_uno_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_uno_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cmp_u_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_uno_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_u_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_uno_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_u_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_uno_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_u_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp uno bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_ueq_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cmp_nlg_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_ueq_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cmp_nlg_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_ueq_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_nlg_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_ueq_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_nlg_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_ueq_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp ueq bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_ugt_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cmp_nle_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_ugt_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cmp_nle_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_ugt_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_nle_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_ugt_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_nle_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_ugt_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp ugt bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_uge_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_uge_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_uge_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_uge_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_uge_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp uge bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_ult_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_ult_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_ult_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_nge_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_ult_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_nge_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_ult_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp ult bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_ule_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_ule_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_ule_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_ngt_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_ule_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_ngt_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_ule_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp ule bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_une_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cmp_neq_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_une_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cmp_neq_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_une_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_neq_f16_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_une_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_neq_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_une_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp une bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_true_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_true_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, 1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_true_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, 1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_true_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, 1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_true_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_true_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, 1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp true bfloat %a, %b
+  ret i1 %op
+}
+
+declare bfloat @llvm.copysign.bf16(bfloat, bfloat)
+
+define bfloat @v_copysign_bf16_bf16(bfloat %mag, bfloat %sign) {
+; GCN-LABEL: v_copysign_bf16_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_bf16_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_bf16_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_bf16_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_bf16_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+  ret bfloat %op
+}
+
+; FIXME: unable to lower arguments: ptr
+; define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) {
+;   %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+;   ret bfloat %op
+; }
+
+; FIXME: unable to lower arguments: ptr
+; define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) {
+;   %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+;   ret bfloat %op
+; }
+
+; FIXME: unable to translate instruction: fptrunc
+; define bfloat @v_copysign_bf16_f32(bfloat %mag, float %sign.f32) {
+;   %sign = fptrunc float %sign.f32 to bfloat
+;   %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+;   ret bfloat %op
+; }
+
+; FIXME: unable to translate instruction: fptrunc
+; define bfloat @v_copysign_bf16_f64(bfloat %mag, double %sign.f64) {
+;   %sign = fptrunc double %sign.f64 to bfloat
+;   %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+;   ret bfloat %op
+; }
+
+define bfloat @v_copysign_bf16_f16(bfloat %mag, half %sign.f16) {
+; GCN-LABEL: v_copysign_bf16_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_bf16_f16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_bf16_f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_bf16_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_bf16_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %sign = bitcast half %sign.f16 to bfloat
+  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+  ret bfloat %op
+}
+
+define amdgpu_ps i32 @s_copysign_bf16_bf16(bfloat inreg %mag, bfloat inreg %sign) {
+; GCN-LABEL: s_copysign_bf16_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GCN-NEXT:    s_and_b32 s1, s1, 0xffff8000
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s0, 0xffff, s0
+; GCN-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: s_copysign_bf16_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX7-NEXT:    s_and_b32 s1, s1, 0xffff8000
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_copysign_bf16_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff8000
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_copysign_bf16_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff8000
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_copysign_bf16_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xffff8000
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX10-NEXT:    ; return to shader part epilog
+  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+  %cast = bitcast bfloat %op to i16
+  %zext = zext i16 %cast to i32
+  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+  ret i32 %readlane
+}
+
+; FIXME: unable to translate instruction: fptrunc
+; define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f32) {
+;   %sign = fptrunc float %sign.f32 to bfloat
+;   %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+;   %cast = bitcast bfloat %op to i16
+;   %zext = zext i16 %cast to i32
+;   %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+;   ret i32 %readlane
+; }
+
+; FIXME: unable to translate instruction: fptrunc
+; define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign.f64) {
+;   %sign = fptrunc double %sign.f64 to bfloat
+;   %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+;   %cast = bitcast bfloat %op to i16
+;   %zext = zext i16 %cast to i32
+;   %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+;   ret i32 %readlane
+; }
+
+define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f16) {
+; GCN-LABEL: s_copysign_bf16_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GCN-NEXT:    s_and_b32 s1, s1, 0xffff8000
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s0, 0xffff, s0
+; GCN-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: s_copysign_bf16_f16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX7-NEXT:    s_and_b32 s1, s1, 0xffff8000
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_copysign_bf16_f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff8000
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_copysign_bf16_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff8000
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_copysign_bf16_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xffff8000
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX10-NEXT:    ; return to shader part epilog
+  %sign = bitcast half %sign.f16 to bfloat
+  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+  %cast = bitcast bfloat %op to i16
+  %zext = zext i16 %cast to i32
+  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+  ret i32 %readlane
+}
+
+declare float @llvm.copysign.f32(float, float)
+
+; FIXME: unable to translate instruction: fpext
+; define float @v_copysign_f32_bf16(float %mag, bfloat %sign.bf16) {
+;   %sign = fpext bfloat %sign.bf16 to float
+;   %op = call float @llvm.copysign.f32(float %mag, float %sign)
+;   ret float %op
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define amdgpu_ps i32 @s_copysign_f32_bf16(float inreg %mag, bfloat inreg %sign.bf16) {
+;   %sign = fpext bfloat %sign.bf16 to float
+;   %op = call float @llvm.copysign.f32(float %mag, float %sign)
+;   %cast = bitcast float %op to i32
+;   %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+;   ret i32 %readlane
+; }
+
+declare half @llvm.copysign.f16(half, half)
+
+define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) {
+; GCN-LABEL: v_copysign_f16_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_f16_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_f16_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_f16_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_f16_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %sign = bitcast bfloat %sign.bf16 to half
+  %op = call half @llvm.copysign.f16(half %mag, half %sign)
+  ret half %op
+}
+
+define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf16) {
+; GCN-LABEL: s_copysign_f16_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GCN-NEXT:    s_and_b32 s1, s1, 0xffff8000
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s0, 0xffff, s0
+; GCN-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: s_copysign_f16_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX7-NEXT:    s_and_b32 s1, s1, 0xffff8000
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_copysign_f16_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff8000
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_copysign_f16_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff8000
+; GFX9-NEXT:    s_or_b32 s0, s0, s1
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_copysign_f16_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_and_b32 s0, s0, 0x7fff
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xffff8000
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX10-NEXT:    ; return to shader part epilog
+  %sign = bitcast bfloat %sign.bf16 to half
+  %op = call half @llvm.copysign.f16(half %mag, half %sign)
+  %cast = bitcast half %op to i16
+  %zext = zext i16 %cast to i32
+  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+  ret i32 %readlane
+}
+
+declare double @llvm.copysign.f64(double, double)
+
+; FIXME: unable to translate instruction: fpext
+; define double @v_copysign_f64_bf16(double %mag, bfloat %sign.bf16) {
+;   %sign = fpext bfloat %sign.bf16 to double
+;   %op = call double @llvm.copysign.f64(double %mag, double %sign)
+;   ret double %op
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define amdgpu_ps <2 x i32> @s_copysign_f64_bf16(double inreg %mag, bfloat inreg %sign.bf16) {
+;   %sign = fpext bfloat %sign.bf16 to double
+;   %op = call double @llvm.copysign.f64(double %mag, double %sign)
+;   %cast = bitcast double %op to <2 x i32>
+;   %cast.0 = extractelement <2 x i32> %cast, i32 0
+;   %cast.1 = extractelement <2 x i32> %cast, i32 1
+;   %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.0)
+;   %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.1)
+;   %ins.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
+;   %ins.1 = insertelement <2 x i32> %ins.0, i32 %readlane1, i32 1
+;   ret <2 x i32> %ins.1
+; }
+
+; FIXME: unable to translate instruction: fptosi
+; define i16 @v_fptosi_bf16_to_i16(bfloat %x) {
+;   %op = fptosi bfloat %x to i16
+;   ret i16 %op
+; }
+
+; FIXME: unable to translate instruction: fptosi
+; define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) {
+;   %op = fptosi <2 x bfloat> %x to <2 x i16>
+;   ret <2 x i16> %op
+; }
+
+; FIXME: unable to translate instruction: fptosi
+; define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) {
+;   %op = fptosi <3 x bfloat> %x to <3 x i16>
+;   ret <3 x i16> %op
+; }
+
+; FIXME: unable to translate instruction: fptosi
+; define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
+;   %op = fptosi <4 x bfloat> %x to <4 x i16>
+;   ret <4 x i16> %op
+; }
+
+; FIXME: unable to translate instruction: fptosi
+; define i32 @v_fptosi_bf16_to_i32(bfloat %x) {
+;   %op = fptosi bfloat %x to i32
+;   ret i32 %op
+; }
+
+; FIXME: unable to translate instruction: fptosi
+; define <2 x i32> @v_fptosi_v2bf16_to_v2i32(<2 x bfloat> %x) {
+;   %op = fptosi <2 x bfloat> %x to <2 x i32>
+;   ret <2 x i32> %op
+; }
+
+; FIXME: unable to translate instruction: fptosi
+; define <3 x i32> @v_fptosi_v3bf16_to_v3i32(<3 x bfloat> %x) {
+;   %op = fptosi <3 x bfloat> %x to <3 x i32>
+;   ret <3 x i32> %op
+; }
+
+; FIXME: unable to translate instruction: fptosi
+; define <4 x i32> @v_fptosi_v4bf16_to_v4i32(<4 x bfloat> %x) {
+;   %op = fptosi <4 x bfloat> %x to <4 x i32>
+;   ret <4 x i32> %op
+; }
+
+; FIXME: unable to translate instruction: fptosi
+; define i64 @v_fptosi_bf16_to_i64(bfloat %x) {
+;   %op = fptosi bfloat %x to i64
+;   ret i64 %op
+; }
+
+; FIXME: unable to translate instruction: fptosi
+; define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) {
+;   %op = fptosi <2 x bfloat> %x to <2 x i64>
+;   ret <2 x i64> %op
+; }
+
+; FIXME: unable to translate instruction: fptosi
+; define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) {
+;   %op = fptosi <3 x bfloat> %x to <3 x i64>
+;   ret <3 x i64> %op
+; }
+
+; FIXME: unable to translate instruction: fptosi
+; define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) {
+;   %op = fptosi <4 x bfloat> %x to <4 x i64>
+;   ret <4 x i64> %op
+; }
+
+; FIXME: unable to translate instruction: sitofp
+; define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
+;   %op = sitofp i16 %x to bfloat
+;   ret bfloat %op
+; }
+
+; FIXME: unable to translate instruction: sitofp
+; define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
+;   %op = sitofp <2 x i16> %x to <2 x bfloat>
+;   ret <2 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: sitofp
+; define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
+;   %op = sitofp <3 x i16> %x to <3 x bfloat>
+;   ret <3 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: sitofp
+; define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
+;   %op = sitofp <4 x i16> %x to <4 x bfloat>
+;   ret <4 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: sitofp
+; define bfloat @v_sitofp_i32_to_bf16(i32 %x) {
+;   %op = sitofp i32 %x to bfloat
+;   ret bfloat %op
+; }
+
+; FIXME: unable to translate instruction: sitofp
+; define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
+;   %op = sitofp <2 x i32> %x to <2 x bfloat>
+;   ret <2 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: sitofp
+; define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
+;   %op = sitofp <3 x i32> %x to <3 x bfloat>
+;   ret <3 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: sitofp
+; define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
+;   %op = sitofp <4 x i32> %x to <4 x bfloat>
+;   ret <4 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: sitofp
+; define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
+;   %op = sitofp i64 %x to bfloat
+;   ret bfloat %op
+; }
+
+; FIXME: unable to translate instruction: sitofp
+; define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
+;   %op = sitofp <2 x i64> %x to <2 x bfloat>
+;   ret <2 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: sitofp
+; define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
+;   %op = sitofp <3 x i64> %x to <3 x bfloat>
+;   ret <3 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: sitofp
+; define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
+;   %op = sitofp <4 x i64> %x to <4 x bfloat>
+;   ret <4 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: uitofp
+; define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
+;   %op = uitofp i16 %x to bfloat
+;   ret bfloat %op
+; }
+
+; FIXME: unable to translate instruction: uitofp
+; define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
+;   %op = uitofp <2 x i16> %x to <2 x bfloat>
+;   ret <2 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: uitofp
+; define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
+;   %op = uitofp <3 x i16> %x to <3 x bfloat>
+;   ret <3 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: uitofp
+; define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
+;   %op = uitofp <4 x i16> %x to <4 x bfloat>
+;   ret <4 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: uitofp
+; define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
+;   %op = uitofp i32 %x to bfloat
+;   ret bfloat %op
+; }
+
+; FIXME: unable to translate instruction: uitofp
+; define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
+;   %op = uitofp <2 x i32> %x to <2 x bfloat>
+;   ret <2 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: uitofp
+; define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
+;   %op = uitofp <3 x i32> %x to <3 x bfloat>
+;   ret <3 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: uitofp
+; define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
+;   %op = uitofp <4 x i32> %x to <4 x bfloat>
+;   ret <4 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: uitofp
+; define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
+;   %op = uitofp i64 %x to bfloat
+;   ret bfloat %op
+; }
+
+; FIXME: unable to translate instruction: uitofp
+; define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
+;   %op = uitofp <2 x i64> %x to <2 x bfloat>
+;   ret <2 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: uitofp
+; define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
+;   %op = uitofp <3 x i64> %x to <3 x bfloat>
+;   ret <3 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: uitofp
+; define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
+;   %op = uitofp <4 x i64> %x to <4 x bfloat>
+;   ret <4 x bfloat> %op
+; }
+
+define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
+; GCN-LABEL: v_select_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_select_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_select_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_select_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_select_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = select i1 %cond, bfloat %a, bfloat %b
+  ret bfloat %op
+}
+
+define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
+; GCN-LABEL: v_select_fneg_lhs_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_select_fneg_lhs_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
+; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_select_fneg_lhs_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_select_fneg_lhs_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_select_fneg_lhs_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %neg.a = fneg bfloat %a
+  %op = select i1 %cond, bfloat %neg.a, bfloat %b
+  ret bfloat %op
+}
+
+define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
+; GCN-LABEL: v_select_fneg_rhs_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
+; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_select_fneg_rhs_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
+; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_select_fneg_rhs_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_select_fneg_rhs_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_select_fneg_rhs_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %neg.b = fneg bfloat %b
+  %op = select i1 %cond, bfloat %a, bfloat %neg.b
+  ret bfloat %op
+}
+
+define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_select_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
+; GCN-NEXT:    v_or_b32_e32 v1, v2, v1
+; GCN-NEXT:    v_or_b32_e32 v2, v4, v3
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: v4bf16:
+; GFX7-LABEL: v_select_v2bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_select_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_select_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_select_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b
+  ret <2 x bfloat> %op
+}
+
+define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_vselect_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_vselect_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_vselect_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_vselect_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_vselect_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc_lo
+; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b
+  ret <2 x bfloat> %op
+}
+
+define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) {
+; GCN-LABEL: s_select_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: s_select_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_or_b32_e32 v4, v1, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX7-NEXT:    v_or_b32_e32 v2, v0, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_select_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_select_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_select_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s0, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+  %cond = icmp eq i32 %c, 0
+  %op = select i1 %cond, bfloat %a, bfloat %b
+  %cast = bitcast bfloat %op to i16
+  %zext = zext i16 %cast to i32
+  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+  ret i32 %readlane
+}
+
+; FIXME: unable to translate instruction: bitcast
+; define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, i32 %c) {
+;   %cond = icmp eq i32 %c, 0
+;   %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b
+;   %cast = bitcast <2 x bfloat> %op to i32
+;   %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+;   ret i32 %readlane
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x i32> %c) {
+;   %cond = icmp eq <2 x i32> %c, zeroinitializer
+;   %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b
+;   %cast = bitcast <2 x bfloat> %op to i32
+;   %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+;   ret i32 %readlane
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b) {
+;   %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b
+;   ret <3 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) {
+;   %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b
+;   ret <4 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b) {
+;   %op = select i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b
+;   ret <6 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) {
+;   %op = select i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b
+;   ret <8 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b) {
+;   %op = select i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b
+;   ret <16 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b) {
+;   %op = select i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b
+;   ret <32 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> inreg %b, i32 %c) {
+;   %cond = icmp eq i32 %c, 0
+;   %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b
+;   %cast = bitcast <3 x bfloat> %op to i48
+;   %elt0 = trunc i48 %cast to i32
+;   %elt1.hi = lshr i48 %cast, 32
+;   %elt1 = trunc i48 %elt1.hi to i32
+;   %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
+;   %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
+;   %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
+;   %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
+;   ret <2 x i32> %bv.1
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, i32 %c) {
+;   %cond = icmp eq i32 %c, 0
+;   %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b
+;   %cast = bitcast <4 x bfloat> %op to <2 x i32>
+;   %elt0 = extractelement <2 x i32> %cast, i32 0
+;   %elt1 = extractelement <2 x i32> %cast, i32 1
+;   %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
+;   %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
+;   %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
+;   %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
+;   ret <2 x i32> %bv.1
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, <4 x i32> %c) {
+;   %cond = icmp eq <4 x i32> %c, zeroinitializer
+;   %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b
+;   %cast = bitcast <4 x bfloat> %op to <2 x i32>
+;   %elt0 = extractelement <2 x i32> %cast, i32 0
+;   %elt1 = extractelement <2 x i32> %cast, i32 1
+;   %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
+;   %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
+;   %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
+;   %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
+;   ret <2 x i32> %bv.1
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b) {
+;   %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b
+;   ret <4 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b) {
+;   %op = select <8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b
+;   ret <8 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b) {
+;   %op = select <16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b
+;   ret <16 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b) {
+;   %op = select <32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b
+;   ret <32 x bfloat> %op
+; }
+
+declare bfloat @llvm.fma.bf16(bfloat, bfloat, bfloat)
+declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
+declare <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)
+declare <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
+
+define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
+; GCN-LABEL: v_fma_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fma_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v4bf16:
+; GFX8-LABEL: v_fma_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fma_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fma_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
+  ret bfloat %op
+}
+
+define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
+; GCN-LABEL: v_fma_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    v_fma_f32 v0, v0, v2, v4
+; GCN-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fma_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_fma_f32 v0, v0, v2, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fma_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX8-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX8-NEXT:    v_fma_f16 v1, v3, v4, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fma_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fma_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
+  ret <2 x bfloat> %op
+}
+
+define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) {
+; GCN-LABEL: v_fma_v3bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT:    v_fma_f32 v0, v0, v3, v6
+; GCN-NEXT:    v_fma_f32 v1, v1, v4, v7
+; GCN-NEXT:    v_fma_f32 v2, v2, v5, v8
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fma_v3bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_fma_f32 v0, v0, v3, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v8
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_fma_f32 v1, v1, v3, v4
+; GFX7-NEXT:    v_fma_f32 v2, v2, v5, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fma_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT:    v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX8-NEXT:    v_fma_f16 v0, v0, v2, v4
+; GFX8-NEXT:    v_fma_f16 v1, v1, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v4bf16:
+; GFX9-LABEL: v_fma_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-NEXT:    v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v0
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v2, v2
+; GFX9-NEXT:    v_bfi_b32 v2, s4, v4, v4
+; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v4bf16:
+; GFX10-LABEL: v_fma_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX10-NEXT:    v_mov_b32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
+; GFX10-NEXT:    v_bfi_b32 v1, 0xffff, v2, v2
+; GFX10-NEXT:    v_bfi_b32 v2, 0xffff, v4, v4
+; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
+  ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
+; GCN-LABEL: v_fma_v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT:    v_fma_f32 v0, v0, v4, v8
+; GCN-NEXT:    v_fma_f32 v1, v1, v5, v9
+; GCN-NEXT:    v_fma_f32 v2, v2, v6, v10
+; GCN-NEXT:    v_fma_f32 v3, v3, v7, v11
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fma_v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT:    v_fma_f32 v0, v0, v4, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v6
+; GFX7-NEXT:    v_fma_f32 v1, v1, v5, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v10
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v7
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v11
+; GFX7-NEXT:    v_fma_f32 v2, v2, v4, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_fma_f32 v3, v3, v6, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fma_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX8-NEXT:    v_fma_f16 v0, v0, v2, v4
+; GFX8-NEXT:    v_fma_f16 v1, v1, v3, v5
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fma_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fma_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
+  ret <4 x bfloat> %op
+}
+
+declare bfloat @llvm.fmuladd.bf16(bfloat, bfloat, bfloat)
+declare <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
+declare <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)
+declare <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
+
+define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
+; GCN-LABEL: v_fmuladd_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v2
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmuladd_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmuladd_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX8-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmuladd_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX9-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmuladd_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.fmuladd.bf16(bfloat %a, bfloat %b, bfloat %c)
+  ret bfloat %op
+}
+
+define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
+; GCN-LABEL: v_fmuladd_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GCN-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v4
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v5
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmuladd_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmuladd_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mul_f16_e32 v3, v0, v1
+; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v1, v3, v2
+; GFX8-NEXT:    v_add_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmuladd_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX9-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmuladd_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX10-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
+  ret <2 x bfloat> %op
+}
+
+define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) {
+; GCN-LABEL: v_fmuladd_v3bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v3
+; GCN-NEXT:    v_mul_f32_e32 v1, v1, v4
+; GCN-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v6
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v7
+; GCN-NEXT:    v_add_f32_e32 v2, v2, v8
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmuladd_v3bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v6
+; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v7
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v8
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmuladd_v3bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mul_f16_e32 v1, v0, v2
+; GFX8-NEXT:    v_mul_f16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v0, v1, v4
+; GFX8-NEXT:    v_add_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmuladd_v3bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v0
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v2, v2
+; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v4, v4
+; GFX9-NEXT:    v_pk_add_f16 v0, v0, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmuladd_v3bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
+; GFX10-NEXT:    v_bfi_b32 v1, 0xffff, v2, v2
+; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX10-NEXT:    v_bfi_b32 v1, 0xffff, v4, v4
+; GFX10-NEXT:    v_pk_add_f16 v0, v0, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
+  ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
+; GCN-LABEL: v_fmuladd_v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GCN-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GCN-NEXT:    v_mul_f32_e32 v2, v2, v6
+; GCN-NEXT:    v_mul_f32_e32 v3, v3, v7
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v8
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v9
+; GCN-NEXT:    v_add_f32_e32 v2, v2, v10
+; GCN-NEXT:    v_add_f32_e32 v3, v3, v11
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmuladd_v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v6
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v4
+; GFX7-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v8
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v9
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v10
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v11
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmuladd_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mul_f16_e32 v1, v0, v2
+; GFX8-NEXT:    v_mul_f16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v0, v1, v4
+; GFX8-NEXT:    v_add_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmuladd_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
+; GFX9-NEXT:    v_mov_b32_sdwa v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT:    v_pk_add_f16 v0, v0, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v4bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %res = shufflevector <4 x bfloat> %arg0, <4 x bfloat> zeroinitializer, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
-  ret <4 x bfloat> %res
+; GFX10-LABEL: v_fmuladd_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
+; GFX10-NEXT:    v_mov_b32_sdwa v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT:    v_pk_add_f16 v0, v0, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
+  ret <4 x bfloat> %op
 }

>From 6239725c81a4ef1bdf9ca7abd42f8d55dfa5c37a Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 9 Jan 2024 14:05:46 +0100
Subject: [PATCH 3/6] add specific test

---
 llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll   |   2 +
 .../AMDGPU/GlobalISel/irtranslate-bf16.ll     | 376 ++++++++++++++++++
 2 files changed, 378 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslate-bf16.ll

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll
index aaefb634b132aa..ba292b4d046f88 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll
@@ -9,6 +9,8 @@
 ; llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefix=GFX11
 ; llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefix=GFX11
 
+; TODO: Once all cases are working, merge with bf16.ll in parent directory.
+
 define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_load_store:
 ; GCN:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslate-bf16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslate-bf16.ll
new file mode 100644
index 00000000000000..3206f8e55f44eb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslate-bf16.ll
@@ -0,0 +1,376 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -global-isel -stop-after=irtranslator -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9
+
+; tests bf16 argument & return values lowering.
+
+define <3 x bfloat> @v3bf16(<3 x bfloat> %arg0) {
+  ; GFX9-LABEL: name: v3bf16
+  ; GFX9: bb.1 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
+  ; GFX9-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
+  ; GFX9-NEXT:   [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
+  ; GFX9-NEXT:   [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
+  ; GFX9-NEXT:   [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
+  ; GFX9-NEXT:   [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
+  ; GFX9-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32)
+  ; GFX9-NEXT:   [[TRUNC:%[0-9]+]]:_(<3 x s16>) = G_TRUNC [[BUILD_VECTOR]](<3 x s32>)
+  ; GFX9-NEXT:   [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
+  ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16)
+  ; GFX9-NEXT:   [[SHUF:%[0-9]+]]:_(<3 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<3 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2)
+  ; GFX9-NEXT:   [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<3 x s16>)
+  ; GFX9-NEXT:   [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
+  ; GFX9-NEXT:   [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[ANYEXT4]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[ANYEXT5]](s32)
+  ; GFX9-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  %res = shufflevector <3 x bfloat> %arg0, <3 x bfloat> zeroinitializer, <3 x i32> <i32 3, i32 1, i32 2>
+  ret <3 x bfloat> %res
+}
+
+define <4 x bfloat> @v4bf16(<4 x bfloat> %arg0) {
+  ; GFX9-LABEL: name: v4bf16
+  ; GFX9: bb.1 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
+  ; GFX9-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
+  ; GFX9-NEXT:   [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
+  ; GFX9-NEXT:   [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
+  ; GFX9-NEXT:   [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
+  ; GFX9-NEXT:   [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
+  ; GFX9-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32)
+  ; GFX9-NEXT:   [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[BUILD_VECTOR]](<4 x s32>)
+  ; GFX9-NEXT:   [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
+  ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
+  ; GFX9-NEXT:   [[SHUF:%[0-9]+]]:_(<4 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<4 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0)
+  ; GFX9-NEXT:   [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<4 x s16>)
+  ; GFX9-NEXT:   [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
+  ; GFX9-NEXT:   [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[ANYEXT4]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[ANYEXT5]](s32)
+  ; GFX9-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  %res = shufflevector <4 x bfloat> %arg0, <4 x bfloat> zeroinitializer, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
+  ret <4 x bfloat> %res
+}
+
+define <5 x bfloat> @v5bf16(<5 x bfloat> %arg0) {
+  ; GFX9-LABEL: name: v5bf16
+  ; GFX9: bb.1 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX9-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
+  ; GFX9-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
+  ; GFX9-NEXT:   [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
+  ; GFX9-NEXT:   [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
+  ; GFX9-NEXT:   [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
+  ; GFX9-NEXT:   [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
+  ; GFX9-NEXT:   [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32)
+  ; GFX9-NEXT:   [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
+  ; GFX9-NEXT:   [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
+  ; GFX9-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32)
+  ; GFX9-NEXT:   [[TRUNC:%[0-9]+]]:_(<5 x s16>) = G_TRUNC [[BUILD_VECTOR]](<5 x s32>)
+  ; GFX9-NEXT:   [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
+  ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<5 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
+  ; GFX9-NEXT:   [[SHUF:%[0-9]+]]:_(<5 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<5 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4)
+  ; GFX9-NEXT:   [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16), [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<5 x s16>)
+  ; GFX9-NEXT:   [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16)
+  ; GFX9-NEXT:   [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
+  ; GFX9-NEXT:   [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[ANYEXT6]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[ANYEXT7]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[ANYEXT8]](s32)
+  ; GFX9-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+  %res = shufflevector <5 x bfloat> %arg0, <5 x bfloat> zeroinitializer, <5 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4>
+  ret <5 x bfloat> %res
+}
+
+define <6 x bfloat> @v6bf16(<6 x bfloat> %arg0) {
+  ; GFX9-LABEL: name: v6bf16
+  ; GFX9: bb.1 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX9-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
+  ; GFX9-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
+  ; GFX9-NEXT:   [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
+  ; GFX9-NEXT:   [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
+  ; GFX9-NEXT:   [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
+  ; GFX9-NEXT:   [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
+  ; GFX9-NEXT:   [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32)
+  ; GFX9-NEXT:   [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
+  ; GFX9-NEXT:   [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
+  ; GFX9-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<6 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32)
+  ; GFX9-NEXT:   [[TRUNC:%[0-9]+]]:_(<6 x s16>) = G_TRUNC [[BUILD_VECTOR]](<6 x s32>)
+  ; GFX9-NEXT:   [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
+  ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<6 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
+  ; GFX9-NEXT:   [[SHUF:%[0-9]+]]:_(<6 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<6 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4, 5)
+  ; GFX9-NEXT:   [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16), [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<6 x s16>)
+  ; GFX9-NEXT:   [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16)
+  ; GFX9-NEXT:   [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
+  ; GFX9-NEXT:   [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[ANYEXT6]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[ANYEXT7]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[ANYEXT8]](s32)
+  ; GFX9-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+  %res = shufflevector <6 x bfloat> %arg0, <6 x bfloat> zeroinitializer, <6 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5>
+  ret <6 x bfloat> %res
+}
+
+define <7 x bfloat> @v7bf16(<7 x bfloat> %arg0) {
+  ; GFX9-LABEL: name: v7bf16
+  ; GFX9: bb.1 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX9-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX9-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
+  ; GFX9-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
+  ; GFX9-NEXT:   [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
+  ; GFX9-NEXT:   [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
+  ; GFX9-NEXT:   [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
+  ; GFX9-NEXT:   [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
+  ; GFX9-NEXT:   [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32)
+  ; GFX9-NEXT:   [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
+  ; GFX9-NEXT:   [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
+  ; GFX9-NEXT:   [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY3]](s32)
+  ; GFX9-NEXT:   [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16)
+  ; GFX9-NEXT:   [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
+  ; GFX9-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<7 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32), [[ANYEXT6]](s32)
+  ; GFX9-NEXT:   [[TRUNC:%[0-9]+]]:_(<7 x s16>) = G_TRUNC [[BUILD_VECTOR]](<7 x s32>)
+  ; GFX9-NEXT:   [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
+  ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<7 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
+  ; GFX9-NEXT:   [[SHUF:%[0-9]+]]:_(<7 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<7 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4, 5, 6)
+  ; GFX9-NEXT:   [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16), [[UV12:%[0-9]+]]:_(s16), [[UV13:%[0-9]+]]:_(s16), [[UV14:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<7 x s16>)
+  ; GFX9-NEXT:   [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
+  ; GFX9-NEXT:   [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[UV9]](s16)
+  ; GFX9-NEXT:   [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[UV10]](s16)
+  ; GFX9-NEXT:   [[ANYEXT11:%[0-9]+]]:_(s32) = G_ANYEXT [[UV11]](s16)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[ANYEXT8]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[ANYEXT9]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[ANYEXT10]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[ANYEXT11]](s32)
+  ; GFX9-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  %res = shufflevector <7 x bfloat> %arg0, <7 x bfloat> zeroinitializer, <7 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6>
+  ret <7 x bfloat> %res
+}
+
+define <8 x bfloat> @v8bf16(<8 x bfloat> %arg0) {
+  ; GFX9-LABEL: name: v8bf16
+  ; GFX9: bb.1 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX9-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX9-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
+  ; GFX9-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
+  ; GFX9-NEXT:   [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
+  ; GFX9-NEXT:   [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
+  ; GFX9-NEXT:   [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
+  ; GFX9-NEXT:   [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
+  ; GFX9-NEXT:   [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32)
+  ; GFX9-NEXT:   [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
+  ; GFX9-NEXT:   [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
+  ; GFX9-NEXT:   [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY3]](s32)
+  ; GFX9-NEXT:   [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16)
+  ; GFX9-NEXT:   [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
+  ; GFX9-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32), [[ANYEXT6]](s32), [[ANYEXT7]](s32)
+  ; GFX9-NEXT:   [[TRUNC:%[0-9]+]]:_(<8 x s16>) = G_TRUNC [[BUILD_VECTOR]](<8 x s32>)
+  ; GFX9-NEXT:   [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
+  ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
+  ; GFX9-NEXT:   [[SHUF:%[0-9]+]]:_(<8 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<8 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4, 5, 6, 7)
+  ; GFX9-NEXT:   [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16), [[UV12:%[0-9]+]]:_(s16), [[UV13:%[0-9]+]]:_(s16), [[UV14:%[0-9]+]]:_(s16), [[UV15:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<8 x s16>)
+  ; GFX9-NEXT:   [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
+  ; GFX9-NEXT:   [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[UV9]](s16)
+  ; GFX9-NEXT:   [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[UV10]](s16)
+  ; GFX9-NEXT:   [[ANYEXT11:%[0-9]+]]:_(s32) = G_ANYEXT [[UV11]](s16)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[ANYEXT8]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[ANYEXT9]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[ANYEXT10]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[ANYEXT11]](s32)
+  ; GFX9-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  %res = shufflevector <8 x bfloat> %arg0, <8 x bfloat> zeroinitializer, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x bfloat> %res
+}
+
+define <16 x bfloat> @v16bf16(<16 x bfloat> %arg0) {
+  ; GFX9-LABEL: name: v16bf16
+  ; GFX9: bb.1 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX9-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX9-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX9-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+  ; GFX9-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+  ; GFX9-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+  ; GFX9-NEXT:   [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+  ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
+  ; GFX9-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
+  ; GFX9-NEXT:   [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
+  ; GFX9-NEXT:   [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
+  ; GFX9-NEXT:   [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
+  ; GFX9-NEXT:   [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
+  ; GFX9-NEXT:   [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32)
+  ; GFX9-NEXT:   [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
+  ; GFX9-NEXT:   [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
+  ; GFX9-NEXT:   [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY3]](s32)
+  ; GFX9-NEXT:   [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16)
+  ; GFX9-NEXT:   [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
+  ; GFX9-NEXT:   [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY4]](s32)
+  ; GFX9-NEXT:   [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
+  ; GFX9-NEXT:   [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[UV9]](s16)
+  ; GFX9-NEXT:   [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY5]](s32)
+  ; GFX9-NEXT:   [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[UV10]](s16)
+  ; GFX9-NEXT:   [[ANYEXT11:%[0-9]+]]:_(s32) = G_ANYEXT [[UV11]](s16)
+  ; GFX9-NEXT:   [[UV12:%[0-9]+]]:_(s16), [[UV13:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY6]](s32)
+  ; GFX9-NEXT:   [[ANYEXT12:%[0-9]+]]:_(s32) = G_ANYEXT [[UV12]](s16)
+  ; GFX9-NEXT:   [[ANYEXT13:%[0-9]+]]:_(s32) = G_ANYEXT [[UV13]](s16)
+  ; GFX9-NEXT:   [[UV14:%[0-9]+]]:_(s16), [[UV15:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY7]](s32)
+  ; GFX9-NEXT:   [[ANYEXT14:%[0-9]+]]:_(s32) = G_ANYEXT [[UV14]](s16)
+  ; GFX9-NEXT:   [[ANYEXT15:%[0-9]+]]:_(s32) = G_ANYEXT [[UV15]](s16)
+  ; GFX9-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32), [[ANYEXT6]](s32), [[ANYEXT7]](s32), [[ANYEXT8]](s32), [[ANYEXT9]](s32), [[ANYEXT10]](s32), [[ANYEXT11]](s32), [[ANYEXT12]](s32), [[ANYEXT13]](s32), [[ANYEXT14]](s32), [[ANYEXT15]](s32)
+  ; GFX9-NEXT:   [[TRUNC:%[0-9]+]]:_(<16 x s16>) = G_TRUNC [[BUILD_VECTOR]](<16 x s32>)
+  ; GFX9-NEXT:   [[UV16:%[0-9]+]]:_(s16), [[UV17:%[0-9]+]]:_(s16), [[UV18:%[0-9]+]]:_(s16), [[UV19:%[0-9]+]]:_(s16), [[UV20:%[0-9]+]]:_(s16), [[UV21:%[0-9]+]]:_(s16), [[UV22:%[0-9]+]]:_(s16), [[UV23:%[0-9]+]]:_(s16), [[UV24:%[0-9]+]]:_(s16), [[UV25:%[0-9]+]]:_(s16), [[UV26:%[0-9]+]]:_(s16), [[UV27:%[0-9]+]]:_(s16), [[UV28:%[0-9]+]]:_(s16), [[UV29:%[0-9]+]]:_(s16), [[UV30:%[0-9]+]]:_(s16), [[UV31:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[TRUNC]](<16 x s16>)
+  ; GFX9-NEXT:   [[ANYEXT16:%[0-9]+]]:_(s32) = G_ANYEXT [[UV16]](s16)
+  ; GFX9-NEXT:   [[ANYEXT17:%[0-9]+]]:_(s32) = G_ANYEXT [[UV17]](s16)
+  ; GFX9-NEXT:   [[ANYEXT18:%[0-9]+]]:_(s32) = G_ANYEXT [[UV18]](s16)
+  ; GFX9-NEXT:   [[ANYEXT19:%[0-9]+]]:_(s32) = G_ANYEXT [[UV19]](s16)
+  ; GFX9-NEXT:   [[ANYEXT20:%[0-9]+]]:_(s32) = G_ANYEXT [[UV20]](s16)
+  ; GFX9-NEXT:   [[ANYEXT21:%[0-9]+]]:_(s32) = G_ANYEXT [[UV21]](s16)
+  ; GFX9-NEXT:   [[ANYEXT22:%[0-9]+]]:_(s32) = G_ANYEXT [[UV22]](s16)
+  ; GFX9-NEXT:   [[ANYEXT23:%[0-9]+]]:_(s32) = G_ANYEXT [[UV23]](s16)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[ANYEXT16]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[ANYEXT17]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[ANYEXT18]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[ANYEXT19]](s32)
+  ; GFX9-NEXT:   $vgpr4 = COPY [[ANYEXT20]](s32)
+  ; GFX9-NEXT:   $vgpr5 = COPY [[ANYEXT21]](s32)
+  ; GFX9-NEXT:   $vgpr6 = COPY [[ANYEXT22]](s32)
+  ; GFX9-NEXT:   $vgpr7 = COPY [[ANYEXT23]](s32)
+  ; GFX9-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+  ret <16 x bfloat> %arg0
+}
+
+define <32 x bfloat> @v32bf16(<32 x bfloat> %arg0) {
+  ; GFX9-LABEL: name: v32bf16
+  ; GFX9: bb.1 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX9-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX9-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX9-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX9-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+  ; GFX9-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+  ; GFX9-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+  ; GFX9-NEXT:   [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+  ; GFX9-NEXT:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+  ; GFX9-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+  ; GFX9-NEXT:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+  ; GFX9-NEXT:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+  ; GFX9-NEXT:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+  ; GFX9-NEXT:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+  ; GFX9-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+  ; GFX9-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+  ; GFX9-NEXT:   [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
+  ; GFX9-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
+  ; GFX9-NEXT:   [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
+  ; GFX9-NEXT:   [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
+  ; GFX9-NEXT:   [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
+  ; GFX9-NEXT:   [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
+  ; GFX9-NEXT:   [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32)
+  ; GFX9-NEXT:   [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
+  ; GFX9-NEXT:   [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
+  ; GFX9-NEXT:   [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY3]](s32)
+  ; GFX9-NEXT:   [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16)
+  ; GFX9-NEXT:   [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
+  ; GFX9-NEXT:   [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY4]](s32)
+  ; GFX9-NEXT:   [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
+  ; GFX9-NEXT:   [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[UV9]](s16)
+  ; GFX9-NEXT:   [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY5]](s32)
+  ; GFX9-NEXT:   [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[UV10]](s16)
+  ; GFX9-NEXT:   [[ANYEXT11:%[0-9]+]]:_(s32) = G_ANYEXT [[UV11]](s16)
+  ; GFX9-NEXT:   [[UV12:%[0-9]+]]:_(s16), [[UV13:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY6]](s32)
+  ; GFX9-NEXT:   [[ANYEXT12:%[0-9]+]]:_(s32) = G_ANYEXT [[UV12]](s16)
+  ; GFX9-NEXT:   [[ANYEXT13:%[0-9]+]]:_(s32) = G_ANYEXT [[UV13]](s16)
+  ; GFX9-NEXT:   [[UV14:%[0-9]+]]:_(s16), [[UV15:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY7]](s32)
+  ; GFX9-NEXT:   [[ANYEXT14:%[0-9]+]]:_(s32) = G_ANYEXT [[UV14]](s16)
+  ; GFX9-NEXT:   [[ANYEXT15:%[0-9]+]]:_(s32) = G_ANYEXT [[UV15]](s16)
+  ; GFX9-NEXT:   [[UV16:%[0-9]+]]:_(s16), [[UV17:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY8]](s32)
+  ; GFX9-NEXT:   [[ANYEXT16:%[0-9]+]]:_(s32) = G_ANYEXT [[UV16]](s16)
+  ; GFX9-NEXT:   [[ANYEXT17:%[0-9]+]]:_(s32) = G_ANYEXT [[UV17]](s16)
+  ; GFX9-NEXT:   [[UV18:%[0-9]+]]:_(s16), [[UV19:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY9]](s32)
+  ; GFX9-NEXT:   [[ANYEXT18:%[0-9]+]]:_(s32) = G_ANYEXT [[UV18]](s16)
+  ; GFX9-NEXT:   [[ANYEXT19:%[0-9]+]]:_(s32) = G_ANYEXT [[UV19]](s16)
+  ; GFX9-NEXT:   [[UV20:%[0-9]+]]:_(s16), [[UV21:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY10]](s32)
+  ; GFX9-NEXT:   [[ANYEXT20:%[0-9]+]]:_(s32) = G_ANYEXT [[UV20]](s16)
+  ; GFX9-NEXT:   [[ANYEXT21:%[0-9]+]]:_(s32) = G_ANYEXT [[UV21]](s16)
+  ; GFX9-NEXT:   [[UV22:%[0-9]+]]:_(s16), [[UV23:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY11]](s32)
+  ; GFX9-NEXT:   [[ANYEXT22:%[0-9]+]]:_(s32) = G_ANYEXT [[UV22]](s16)
+  ; GFX9-NEXT:   [[ANYEXT23:%[0-9]+]]:_(s32) = G_ANYEXT [[UV23]](s16)
+  ; GFX9-NEXT:   [[UV24:%[0-9]+]]:_(s16), [[UV25:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY12]](s32)
+  ; GFX9-NEXT:   [[ANYEXT24:%[0-9]+]]:_(s32) = G_ANYEXT [[UV24]](s16)
+  ; GFX9-NEXT:   [[ANYEXT25:%[0-9]+]]:_(s32) = G_ANYEXT [[UV25]](s16)
+  ; GFX9-NEXT:   [[UV26:%[0-9]+]]:_(s16), [[UV27:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY13]](s32)
+  ; GFX9-NEXT:   [[ANYEXT26:%[0-9]+]]:_(s32) = G_ANYEXT [[UV26]](s16)
+  ; GFX9-NEXT:   [[ANYEXT27:%[0-9]+]]:_(s32) = G_ANYEXT [[UV27]](s16)
+  ; GFX9-NEXT:   [[UV28:%[0-9]+]]:_(s16), [[UV29:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY14]](s32)
+  ; GFX9-NEXT:   [[ANYEXT28:%[0-9]+]]:_(s32) = G_ANYEXT [[UV28]](s16)
+  ; GFX9-NEXT:   [[ANYEXT29:%[0-9]+]]:_(s32) = G_ANYEXT [[UV29]](s16)
+  ; GFX9-NEXT:   [[UV30:%[0-9]+]]:_(s16), [[UV31:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY15]](s32)
+  ; GFX9-NEXT:   [[ANYEXT30:%[0-9]+]]:_(s32) = G_ANYEXT [[UV30]](s16)
+  ; GFX9-NEXT:   [[ANYEXT31:%[0-9]+]]:_(s32) = G_ANYEXT [[UV31]](s16)
+  ; GFX9-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32), [[ANYEXT6]](s32), [[ANYEXT7]](s32), [[ANYEXT8]](s32), [[ANYEXT9]](s32), [[ANYEXT10]](s32), [[ANYEXT11]](s32), [[ANYEXT12]](s32), [[ANYEXT13]](s32), [[ANYEXT14]](s32), [[ANYEXT15]](s32), [[ANYEXT16]](s32), [[ANYEXT17]](s32), [[ANYEXT18]](s32), [[ANYEXT19]](s32), [[ANYEXT20]](s32), [[ANYEXT21]](s32), [[ANYEXT22]](s32), [[ANYEXT23]](s32), [[ANYEXT24]](s32), [[ANYEXT25]](s32), [[ANYEXT26]](s32), [[ANYEXT27]](s32), [[ANYEXT28]](s32), [[ANYEXT29]](s32), [[ANYEXT30]](s32), [[ANYEXT31]](s32)
+  ; GFX9-NEXT:   [[TRUNC:%[0-9]+]]:_(<32 x s16>) = G_TRUNC [[BUILD_VECTOR]](<32 x s32>)
+  ; GFX9-NEXT:   [[UV32:%[0-9]+]]:_(s16), [[UV33:%[0-9]+]]:_(s16), [[UV34:%[0-9]+]]:_(s16), [[UV35:%[0-9]+]]:_(s16), [[UV36:%[0-9]+]]:_(s16), [[UV37:%[0-9]+]]:_(s16), [[UV38:%[0-9]+]]:_(s16), [[UV39:%[0-9]+]]:_(s16), [[UV40:%[0-9]+]]:_(s16), [[UV41:%[0-9]+]]:_(s16), [[UV42:%[0-9]+]]:_(s16), [[UV43:%[0-9]+]]:_(s16), [[UV44:%[0-9]+]]:_(s16), [[UV45:%[0-9]+]]:_(s16), [[UV46:%[0-9]+]]:_(s16), [[UV47:%[0-9]+]]:_(s16), [[UV48:%[0-9]+]]:_(s16), [[UV49:%[0-9]+]]:_(s16), [[UV50:%[0-9]+]]:_(s16), [[UV51:%[0-9]+]]:_(s16), [[UV52:%[0-9]+]]:_(s16), [[UV53:%[0-9]+]]:_(s16), [[UV54:%[0-9]+]]:_(s16), [[UV55:%[0-9]+]]:_(s16), [[UV56:%[0-9]+]]:_(s16), [[UV57:%[0-9]+]]:_(s16), [[UV58:%[0-9]+]]:_(s16), [[UV59:%[0-9]+]]:_(s16), [[UV60:%[0-9]+]]:_(s16), [[UV61:%[0-9]+]]:_(s16), [[UV62:%[0-9]+]]:_(s16), [[UV63:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[TRUNC]](<32 x s16>)
+  ; GFX9-NEXT:   [[ANYEXT32:%[0-9]+]]:_(s32) = G_ANYEXT [[UV32]](s16)
+  ; GFX9-NEXT:   [[ANYEXT33:%[0-9]+]]:_(s32) = G_ANYEXT [[UV33]](s16)
+  ; GFX9-NEXT:   [[ANYEXT34:%[0-9]+]]:_(s32) = G_ANYEXT [[UV34]](s16)
+  ; GFX9-NEXT:   [[ANYEXT35:%[0-9]+]]:_(s32) = G_ANYEXT [[UV35]](s16)
+  ; GFX9-NEXT:   [[ANYEXT36:%[0-9]+]]:_(s32) = G_ANYEXT [[UV36]](s16)
+  ; GFX9-NEXT:   [[ANYEXT37:%[0-9]+]]:_(s32) = G_ANYEXT [[UV37]](s16)
+  ; GFX9-NEXT:   [[ANYEXT38:%[0-9]+]]:_(s32) = G_ANYEXT [[UV38]](s16)
+  ; GFX9-NEXT:   [[ANYEXT39:%[0-9]+]]:_(s32) = G_ANYEXT [[UV39]](s16)
+  ; GFX9-NEXT:   [[ANYEXT40:%[0-9]+]]:_(s32) = G_ANYEXT [[UV40]](s16)
+  ; GFX9-NEXT:   [[ANYEXT41:%[0-9]+]]:_(s32) = G_ANYEXT [[UV41]](s16)
+  ; GFX9-NEXT:   [[ANYEXT42:%[0-9]+]]:_(s32) = G_ANYEXT [[UV42]](s16)
+  ; GFX9-NEXT:   [[ANYEXT43:%[0-9]+]]:_(s32) = G_ANYEXT [[UV43]](s16)
+  ; GFX9-NEXT:   [[ANYEXT44:%[0-9]+]]:_(s32) = G_ANYEXT [[UV44]](s16)
+  ; GFX9-NEXT:   [[ANYEXT45:%[0-9]+]]:_(s32) = G_ANYEXT [[UV45]](s16)
+  ; GFX9-NEXT:   [[ANYEXT46:%[0-9]+]]:_(s32) = G_ANYEXT [[UV46]](s16)
+  ; GFX9-NEXT:   [[ANYEXT47:%[0-9]+]]:_(s32) = G_ANYEXT [[UV47]](s16)
+  ; GFX9-NEXT:   $vgpr0 = COPY [[ANYEXT32]](s32)
+  ; GFX9-NEXT:   $vgpr1 = COPY [[ANYEXT33]](s32)
+  ; GFX9-NEXT:   $vgpr2 = COPY [[ANYEXT34]](s32)
+  ; GFX9-NEXT:   $vgpr3 = COPY [[ANYEXT35]](s32)
+  ; GFX9-NEXT:   $vgpr4 = COPY [[ANYEXT36]](s32)
+  ; GFX9-NEXT:   $vgpr5 = COPY [[ANYEXT37]](s32)
+  ; GFX9-NEXT:   $vgpr6 = COPY [[ANYEXT38]](s32)
+  ; GFX9-NEXT:   $vgpr7 = COPY [[ANYEXT39]](s32)
+  ; GFX9-NEXT:   $vgpr8 = COPY [[ANYEXT40]](s32)
+  ; GFX9-NEXT:   $vgpr9 = COPY [[ANYEXT41]](s32)
+  ; GFX9-NEXT:   $vgpr10 = COPY [[ANYEXT42]](s32)
+  ; GFX9-NEXT:   $vgpr11 = COPY [[ANYEXT43]](s32)
+  ; GFX9-NEXT:   $vgpr12 = COPY [[ANYEXT44]](s32)
+  ; GFX9-NEXT:   $vgpr13 = COPY [[ANYEXT45]](s32)
+  ; GFX9-NEXT:   $vgpr14 = COPY [[ANYEXT46]](s32)
+  ; GFX9-NEXT:   $vgpr15 = COPY [[ANYEXT47]](s32)
+  ; GFX9-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
+  ret <32 x bfloat> %arg0
+}

>From 50d8e37c0129e0e3897e0c24be1c9da4088fc089 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 9 Jan 2024 14:08:18 +0100
Subject: [PATCH 4/6] Remove bf16.ll

---
 llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll | 13792 ------------------
 1 file changed, 13792 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll
deleted file mode 100644
index ba292b4d046f88..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll
+++ /dev/null
@@ -1,13792 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -global-isel -mtriple=amdgcn | FileCheck %s -check-prefixes=GCN
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefixes=GFX7
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10
-
-; FIXME: GFX11 cannot select some truncs: %0:vgpr_32(s16) = G_TRUNC %1:vgpr_32(s32)
-; llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefix=GFX11
-; llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefix=GFX11
-
-; TODO: Once all cases are working, merge with bf16.ll in parent directory.
-
-define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_load_store:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_load_store:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_load_store:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    flat_store_short v[2:3], v0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_load_store:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_short v[2:3], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_load_store:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_short v[2:3], v0, off
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %val = load bfloat, ptr addrspace(1) %in
-  store bfloat %val, ptr addrspace(1) %out
-  ret void
-}
-
-define <2 x bfloat> @v_load_global_v2bf16(ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_load_global_v2bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_load_global_v2bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_load_global_v2bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_load_global_v2bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_load_global_v2bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <2 x bfloat>, ptr addrspace(1) %ptr
-  ret <2 x bfloat> %load
-}
-
-define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_load_global_v3bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GCN-NEXT:    v_mov_b32_e32 v0, v2
-; GCN-NEXT:    v_mov_b32_e32 v2, v3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_load_global_v3bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX7-NEXT:    v_mov_b32_e32 v0, v2
-; GFX7-NEXT:    v_mov_b32_e32 v2, v3
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_load_global_v3bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_load_global_v3bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_load_global_v3bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <3 x bfloat>, ptr addrspace(1) %ptr
-  ret <3 x bfloat> %load
-}
-
-define <4 x bfloat> @v_load_global_v4bf16(ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_load_global_v4bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
-; GCN-NEXT:    v_mov_b32_e32 v0, v4
-; GCN-NEXT:    v_mov_b32_e32 v2, v5
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_load_global_v4bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX7-NEXT:    v_mov_b32_e32 v2, v1
-; GFX7-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_load_global_v4bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_load_global_v4bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_load_global_v4bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <4 x bfloat>, ptr addrspace(1) %ptr
-  ret <4 x bfloat> %load
-}
-
-define <6 x bfloat> @v_load_global_v6bf16(ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_load_global_v6bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
-; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v8
-; GCN-NEXT:    v_mov_b32_e32 v0, v6
-; GCN-NEXT:    v_mov_b32_e32 v2, v7
-; GCN-NEXT:    v_mov_b32_e32 v4, v8
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_load_global_v6bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx3 v[6:8], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v8
-; GFX7-NEXT:    v_mov_b32_e32 v0, v6
-; GFX7-NEXT:    v_mov_b32_e32 v2, v7
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_load_global_v6bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx3 v[2:4], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX8-NEXT:    v_mov_b32_e32 v0, v2
-; GFX8-NEXT:    v_mov_b32_e32 v2, v3
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_load_global_v6bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx3 v[2:4], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v2, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_load_global_v6bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx3 v[2:4], v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX10-NEXT:    v_mov_b32_e32 v0, v2
-; GFX10-NEXT:    v_mov_b32_e32 v2, v3
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <6 x bfloat>, ptr addrspace(1) %ptr
-  ret <6 x bfloat> %load
-}
-
-define <8 x bfloat> @v_load_global_v8bf16(ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_load_global_v8bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
-; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
-; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v11
-; GCN-NEXT:    v_mov_b32_e32 v0, v8
-; GCN-NEXT:    v_mov_b32_e32 v2, v9
-; GCN-NEXT:    v_mov_b32_e32 v4, v10
-; GCN-NEXT:    v_mov_b32_e32 v6, v11
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_load_global_v8bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v11
-; GFX7-NEXT:    v_mov_b32_e32 v0, v8
-; GFX7-NEXT:    v_mov_b32_e32 v2, v9
-; GFX7-NEXT:    v_mov_b32_e32 v4, v10
-; GFX7-NEXT:    v_mov_b32_e32 v6, v11
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_load_global_v8bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX8-NEXT:    v_mov_b32_e32 v2, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_load_global_v8bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_load_global_v8bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX10-NEXT:    v_mov_b32_e32 v2, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <8 x bfloat>, ptr addrspace(1) %ptr
-  ret <8 x bfloat> %load
-}
-
-define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_load_global_v16bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    buffer_load_dwordx4 v[23:26], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    buffer_load_dwordx4 v[19:22], v[0:1], s[4:7], 0 addr64 offset:16
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v23
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v24
-; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v25
-; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v26
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v19
-; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v20
-; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v21
-; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v22
-; GCN-NEXT:    v_mov_b32_e32 v0, v23
-; GCN-NEXT:    v_mov_b32_e32 v2, v24
-; GCN-NEXT:    v_mov_b32_e32 v4, v25
-; GCN-NEXT:    v_mov_b32_e32 v6, v26
-; GCN-NEXT:    v_mov_b32_e32 v8, v19
-; GCN-NEXT:    v_mov_b32_e32 v10, v20
-; GCN-NEXT:    v_mov_b32_e32 v12, v21
-; GCN-NEXT:    v_mov_b32_e32 v14, v22
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_load_global_v16bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[22:25], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    buffer_load_dwordx4 v[18:21], v[0:1], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v22
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v23
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v24
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v25
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
-; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v19
-; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 16, v20
-; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v21
-; GFX7-NEXT:    v_mov_b32_e32 v0, v22
-; GFX7-NEXT:    v_mov_b32_e32 v2, v23
-; GFX7-NEXT:    v_mov_b32_e32 v4, v24
-; GFX7-NEXT:    v_mov_b32_e32 v6, v25
-; GFX7-NEXT:    v_mov_b32_e32 v8, v18
-; GFX7-NEXT:    v_mov_b32_e32 v10, v19
-; GFX7-NEXT:    v_mov_b32_e32 v12, v20
-; GFX7-NEXT:    v_mov_b32_e32 v14, v21
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_load_global_v16bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v11
-; GFX8-NEXT:    v_mov_b32_e32 v0, v8
-; GFX8-NEXT:    v_mov_b32_e32 v2, v9
-; GFX8-NEXT:    v_mov_b32_e32 v4, v10
-; GFX8-NEXT:    v_mov_b32_e32 v6, v11
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_load_global_v16bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v11
-; GFX9-NEXT:    v_mov_b32_e32 v0, v8
-; GFX9-NEXT:    v_mov_b32_e32 v2, v9
-; GFX9-NEXT:    v_mov_b32_e32 v4, v10
-; GFX9-NEXT:    v_mov_b32_e32 v6, v11
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_load_global_v16bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v11
-; GFX10-NEXT:    v_mov_b32_e32 v0, v8
-; GFX10-NEXT:    v_mov_b32_e32 v2, v9
-; GFX10-NEXT:    v_mov_b32_e32 v4, v10
-; GFX10-NEXT:    v_mov_b32_e32 v6, v11
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <16 x bfloat>, ptr addrspace(1) %ptr
-  ret <16 x bfloat> %load
-}
-
-define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_load_global_v32bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    buffer_load_dwordx4 v[34:37], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    buffer_load_dwordx4 v[39:42], v[0:1], s[4:7], 0 addr64 offset:16
-; GCN-NEXT:    buffer_load_dwordx4 v[48:51], v[0:1], s[4:7], 0 addr64 offset:32
-; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    v_lshrrev_b32_e32 v38, 16, v34
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v35
-; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v36
-; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v37
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v39
-; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v40
-; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v41
-; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v42
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshrrev_b32_e32 v17, 16, v48
-; GCN-NEXT:    buffer_load_dwordx4 v[52:55], v[0:1], s[4:7], 0 addr64 offset:48
-; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v49
-; GCN-NEXT:    v_lshrrev_b32_e32 v21, 16, v50
-; GCN-NEXT:    v_lshrrev_b32_e32 v23, 16, v51
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshrrev_b32_e32 v25, 16, v52
-; GCN-NEXT:    v_lshrrev_b32_e32 v27, 16, v53
-; GCN-NEXT:    v_lshrrev_b32_e32 v29, 16, v54
-; GCN-NEXT:    v_lshrrev_b32_e32 v31, 16, v55
-; GCN-NEXT:    v_mov_b32_e32 v0, v34
-; GCN-NEXT:    v_mov_b32_e32 v2, v35
-; GCN-NEXT:    v_mov_b32_e32 v4, v36
-; GCN-NEXT:    v_mov_b32_e32 v6, v37
-; GCN-NEXT:    v_mov_b32_e32 v8, v39
-; GCN-NEXT:    v_mov_b32_e32 v10, v40
-; GCN-NEXT:    v_mov_b32_e32 v12, v41
-; GCN-NEXT:    v_mov_b32_e32 v14, v42
-; GCN-NEXT:    v_mov_b32_e32 v16, v48
-; GCN-NEXT:    v_mov_b32_e32 v18, v49
-; GCN-NEXT:    v_mov_b32_e32 v20, v50
-; GCN-NEXT:    v_mov_b32_e32 v22, v51
-; GCN-NEXT:    v_mov_b32_e32 v24, v52
-; GCN-NEXT:    v_mov_b32_e32 v26, v53
-; GCN-NEXT:    v_mov_b32_e32 v28, v54
-; GCN-NEXT:    v_mov_b32_e32 v30, v55
-; GCN-NEXT:    v_mov_b32_e32 v1, v38
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_load_global_v32bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX7-NEXT:    buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[38:41], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    buffer_load_dwordx4 v[48:51], v[0:1], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT:    buffer_load_dwordx4 v[34:37], v[0:1], s[4:7], 0 addr64 offset:32
-; GFX7-NEXT:    buffer_load_dwordx4 v[52:55], v[0:1], s[4:7], 0 addr64 offset:48
-; GFX7-NEXT:    s_waitcnt vmcnt(3)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v40
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v41
-; GFX7-NEXT:    v_mov_b32_e32 v4, v40
-; GFX7-NEXT:    v_mov_b32_e32 v6, v41
-; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v38
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v39
-; GFX7-NEXT:    s_waitcnt vmcnt(4)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v48
-; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v49
-; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 16, v50
-; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v51
-; GFX7-NEXT:    s_waitcnt vmcnt(3)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v17, 16, v34
-; GFX7-NEXT:    v_lshrrev_b32_e32 v19, 16, v35
-; GFX7-NEXT:    v_lshrrev_b32_e32 v21, 16, v36
-; GFX7-NEXT:    v_lshrrev_b32_e32 v23, 16, v37
-; GFX7-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v25, 16, v52
-; GFX7-NEXT:    v_lshrrev_b32_e32 v27, 16, v53
-; GFX7-NEXT:    v_lshrrev_b32_e32 v29, 16, v54
-; GFX7-NEXT:    v_lshrrev_b32_e32 v31, 16, v55
-; GFX7-NEXT:    v_mov_b32_e32 v0, v38
-; GFX7-NEXT:    v_mov_b32_e32 v2, v39
-; GFX7-NEXT:    v_mov_b32_e32 v8, v48
-; GFX7-NEXT:    v_mov_b32_e32 v10, v49
-; GFX7-NEXT:    v_mov_b32_e32 v12, v50
-; GFX7-NEXT:    v_mov_b32_e32 v14, v51
-; GFX7-NEXT:    v_mov_b32_e32 v16, v34
-; GFX7-NEXT:    v_mov_b32_e32 v18, v35
-; GFX7-NEXT:    v_mov_b32_e32 v20, v36
-; GFX7-NEXT:    v_mov_b32_e32 v22, v37
-; GFX7-NEXT:    v_mov_b32_e32 v24, v52
-; GFX7-NEXT:    v_mov_b32_e32 v26, v53
-; GFX7-NEXT:    v_mov_b32_e32 v28, v54
-; GFX7-NEXT:    v_mov_b32_e32 v30, v55
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_load_global_v32bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[22:25], v[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dwordx4 v[18:21], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v22
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v23
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v24
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v25
-; GFX8-NEXT:    v_mov_b32_e32 v0, v22
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v19
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v20
-; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v21
-; GFX8-NEXT:    v_mov_b32_e32 v2, v23
-; GFX8-NEXT:    v_mov_b32_e32 v4, v24
-; GFX8-NEXT:    v_mov_b32_e32 v6, v25
-; GFX8-NEXT:    v_mov_b32_e32 v8, v18
-; GFX8-NEXT:    v_mov_b32_e32 v10, v19
-; GFX8-NEXT:    v_mov_b32_e32 v12, v20
-; GFX8-NEXT:    v_mov_b32_e32 v14, v21
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_load_global_v32bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[22:25], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx4 v[18:21], v[0:1], off offset:16
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v22
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v23
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v24
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v25
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v19
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v20
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v21
-; GFX9-NEXT:    v_mov_b32_e32 v0, v22
-; GFX9-NEXT:    v_mov_b32_e32 v2, v23
-; GFX9-NEXT:    v_mov_b32_e32 v4, v24
-; GFX9-NEXT:    v_mov_b32_e32 v6, v25
-; GFX9-NEXT:    v_mov_b32_e32 v8, v18
-; GFX9-NEXT:    v_mov_b32_e32 v10, v19
-; GFX9-NEXT:    v_mov_b32_e32 v12, v20
-; GFX9-NEXT:    v_mov_b32_e32 v14, v21
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_load_global_v32bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx4 v[22:25], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx4 v[18:21], v[0:1], off offset:16
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v22
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v23
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v24
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v25
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v19
-; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v20
-; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v21
-; GFX10-NEXT:    v_mov_b32_e32 v0, v22
-; GFX10-NEXT:    v_mov_b32_e32 v2, v23
-; GFX10-NEXT:    v_mov_b32_e32 v4, v24
-; GFX10-NEXT:    v_mov_b32_e32 v6, v25
-; GFX10-NEXT:    v_mov_b32_e32 v8, v18
-; GFX10-NEXT:    v_mov_b32_e32 v10, v19
-; GFX10-NEXT:    v_mov_b32_e32 v12, v20
-; GFX10-NEXT:    v_mov_b32_e32 v14, v21
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <32 x bfloat>, ptr addrspace(1) %ptr
-  ret <32 x bfloat> %load
-}
-
-define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_load_global_v64bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    buffer_load_dwordx4 v[21:24], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT:    buffer_load_dwordx4 v[25:28], v[1:2], s[4:7], 0 addr64 offset:16
-; GCN-NEXT:    buffer_load_dwordx4 v[29:32], v[1:2], s[4:7], 0 addr64 offset:32
-; GCN-NEXT:    buffer_load_dwordx4 v[17:20], v[1:2], s[4:7], 0 addr64 offset:48
-; GCN-NEXT:    buffer_load_dwordx4 v[13:16], v[1:2], s[4:7], 0 addr64 offset:64
-; GCN-NEXT:    buffer_load_dwordx4 v[9:12], v[1:2], s[4:7], 0 addr64 offset:80
-; GCN-NEXT:    buffer_load_dwordx4 v[5:8], v[1:2], s[4:7], 0 addr64 offset:96
-; GCN-NEXT:    buffer_load_dwordx4 v[1:4], v[1:2], s[4:7], 0 addr64 offset:112
-; GCN-NEXT:    s_waitcnt vmcnt(7)
-; GCN-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v21, vcc, 4, v0
-; GCN-NEXT:    buffer_store_dword v22, v21, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v21, vcc, 8, v0
-; GCN-NEXT:    buffer_store_dword v23, v21, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v21, vcc, 12, v0
-; GCN-NEXT:    buffer_store_dword v24, v21, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v21, vcc, 16, v0
-; GCN-NEXT:    s_waitcnt expcnt(2)
-; GCN-NEXT:    v_add_i32_e32 v22, vcc, 20, v0
-; GCN-NEXT:    s_waitcnt expcnt(1)
-; GCN-NEXT:    v_add_i32_e32 v23, vcc, 24, v0
-; GCN-NEXT:    s_waitcnt vmcnt(10)
-; GCN-NEXT:    buffer_store_dword v25, v21, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v21, vcc, 28, v0
-; GCN-NEXT:    buffer_store_dword v26, v22, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v22, vcc, 32, v0
-; GCN-NEXT:    buffer_store_dword v27, v23, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v23, vcc, 36, v0
-; GCN-NEXT:    buffer_store_dword v28, v21, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v21, vcc, 40, v0
-; GCN-NEXT:    s_waitcnt expcnt(4)
-; GCN-NEXT:    v_add_i32_e32 v24, vcc, 44, v0
-; GCN-NEXT:    s_waitcnt expcnt(3)
-; GCN-NEXT:    v_add_i32_e32 v25, vcc, 48, v0
-; GCN-NEXT:    s_waitcnt expcnt(2)
-; GCN-NEXT:    v_add_i32_e32 v26, vcc, 52, v0
-; GCN-NEXT:    s_waitcnt expcnt(1)
-; GCN-NEXT:    v_add_i32_e32 v27, vcc, 56, v0
-; GCN-NEXT:    s_waitcnt vmcnt(13)
-; GCN-NEXT:    buffer_store_dword v29, v22, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v22, vcc, 60, v0
-; GCN-NEXT:    buffer_store_dword v30, v23, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v23, vcc, 64, v0
-; GCN-NEXT:    buffer_store_dword v31, v21, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v21, 0x44
-; GCN-NEXT:    buffer_store_dword v32, v24, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v24, 0x48
-; GCN-NEXT:    s_waitcnt expcnt(4)
-; GCN-NEXT:    v_mov_b32_e32 v28, 0x4c
-; GCN-NEXT:    s_waitcnt expcnt(3)
-; GCN-NEXT:    v_mov_b32_e32 v29, 0x50
-; GCN-NEXT:    s_waitcnt expcnt(2)
-; GCN-NEXT:    v_mov_b32_e32 v30, 0x54
-; GCN-NEXT:    s_waitcnt expcnt(1)
-; GCN-NEXT:    v_mov_b32_e32 v31, 0x58
-; GCN-NEXT:    s_waitcnt vmcnt(14)
-; GCN-NEXT:    buffer_store_dword v17, v25, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v17, 0x5c
-; GCN-NEXT:    buffer_store_dword v18, v26, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v18, 0x60
-; GCN-NEXT:    buffer_store_dword v19, v27, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v19, 0x64
-; GCN-NEXT:    buffer_store_dword v20, v22, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v20, 0x68
-; GCN-NEXT:    v_add_i32_e32 v22, vcc, 0x6c, v0
-; GCN-NEXT:    v_add_i32_e32 v25, vcc, 0x70, v0
-; GCN-NEXT:    v_add_i32_e32 v26, vcc, 0x74, v0
-; GCN-NEXT:    v_add_i32_e32 v27, vcc, 0x78, v0
-; GCN-NEXT:    v_add_i32_e32 v21, vcc, v0, v21
-; GCN-NEXT:    v_add_i32_e32 v24, vcc, v0, v24
-; GCN-NEXT:    v_add_i32_e32 v28, vcc, v0, v28
-; GCN-NEXT:    v_add_i32_e32 v29, vcc, v0, v29
-; GCN-NEXT:    v_add_i32_e32 v30, vcc, v0, v30
-; GCN-NEXT:    v_add_i32_e32 v31, vcc, v0, v31
-; GCN-NEXT:    v_add_i32_e32 v17, vcc, v0, v17
-; GCN-NEXT:    v_add_i32_e32 v18, vcc, v0, v18
-; GCN-NEXT:    v_add_i32_e32 v19, vcc, v0, v19
-; GCN-NEXT:    v_add_i32_e32 v20, vcc, v0, v20
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
-; GCN-NEXT:    buffer_store_dword v13, v23, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v14, v21, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v15, v24, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v16, v28, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v9, v29, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v10, v30, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v11, v31, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v12, v17, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(14)
-; GCN-NEXT:    buffer_store_dword v5, v18, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v6, v19, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v7, v20, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v8, v22, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v1, v25, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v2, v26, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v3, v27, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_load_global_v64bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[21:24], v[1:2], s[4:7], 0 addr64
-; GFX7-NEXT:    buffer_load_dwordx4 v[25:28], v[1:2], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT:    buffer_load_dwordx4 v[29:32], v[1:2], s[4:7], 0 addr64 offset:32
-; GFX7-NEXT:    buffer_load_dwordx4 v[13:16], v[1:2], s[4:7], 0 addr64 offset:48
-; GFX7-NEXT:    buffer_load_dwordx4 v[17:20], v[1:2], s[4:7], 0 addr64 offset:64
-; GFX7-NEXT:    buffer_load_dwordx4 v[9:12], v[1:2], s[4:7], 0 addr64 offset:80
-; GFX7-NEXT:    buffer_load_dwordx4 v[5:8], v[1:2], s[4:7], 0 addr64 offset:96
-; GFX7-NEXT:    buffer_load_dwordx4 v[1:4], v[1:2], s[4:7], 0 addr64 offset:112
-; GFX7-NEXT:    s_waitcnt vmcnt(7)
-; GFX7-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v21, vcc, 4, v0
-; GFX7-NEXT:    buffer_store_dword v22, v21, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v21, vcc, 8, v0
-; GFX7-NEXT:    buffer_store_dword v23, v21, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v21, vcc, 12, v0
-; GFX7-NEXT:    v_add_i32_e32 v23, vcc, 16, v0
-; GFX7-NEXT:    buffer_store_dword v24, v21, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(10)
-; GFX7-NEXT:    buffer_store_dword v25, v23, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v23, vcc, 20, v0
-; GFX7-NEXT:    buffer_store_dword v26, v23, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v23, vcc, 24, v0
-; GFX7-NEXT:    buffer_store_dword v27, v23, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v23, vcc, 28, v0
-; GFX7-NEXT:    v_add_i32_e32 v26, vcc, 32, v0
-; GFX7-NEXT:    buffer_store_dword v28, v23, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v27, vcc, 36, v0
-; GFX7-NEXT:    s_waitcnt vmcnt(13)
-; GFX7-NEXT:    buffer_store_dword v29, v26, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v26, vcc, 40, v0
-; GFX7-NEXT:    v_mov_b32_e32 v21, 0x44
-; GFX7-NEXT:    v_mov_b32_e32 v22, 0x48
-; GFX7-NEXT:    v_mov_b32_e32 v23, 0x4c
-; GFX7-NEXT:    v_mov_b32_e32 v24, 0x50
-; GFX7-NEXT:    v_mov_b32_e32 v25, 0x54
-; GFX7-NEXT:    buffer_store_dword v30, v27, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v27, vcc, 44, v0
-; GFX7-NEXT:    buffer_store_dword v31, v26, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v26, vcc, 48, v0
-; GFX7-NEXT:    buffer_store_dword v32, v27, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v27, vcc, 52, v0
-; GFX7-NEXT:    v_add_i32_e32 v28, vcc, 56, v0
-; GFX7-NEXT:    v_add_i32_e32 v29, vcc, 60, v0
-; GFX7-NEXT:    v_add_i32_e32 v30, vcc, 64, v0
-; GFX7-NEXT:    v_add_i32_e32 v21, vcc, v0, v21
-; GFX7-NEXT:    v_add_i32_e32 v22, vcc, v0, v22
-; GFX7-NEXT:    v_add_i32_e32 v23, vcc, v0, v23
-; GFX7-NEXT:    v_add_i32_e32 v24, vcc, v0, v24
-; GFX7-NEXT:    v_add_i32_e32 v25, vcc, v0, v25
-; GFX7-NEXT:    s_waitcnt vmcnt(14)
-; GFX7-NEXT:    buffer_store_dword v13, v26, s[0:3], 0 offen
-; GFX7-NEXT:    buffer_store_dword v14, v27, s[0:3], 0 offen
-; GFX7-NEXT:    buffer_store_dword v15, v28, s[0:3], 0 offen
-; GFX7-NEXT:    buffer_store_dword v16, v29, s[0:3], 0 offen
-; GFX7-NEXT:    buffer_store_dword v17, v30, s[0:3], 0 offen
-; GFX7-NEXT:    buffer_store_dword v18, v21, s[0:3], 0 offen
-; GFX7-NEXT:    buffer_store_dword v19, v22, s[0:3], 0 offen
-; GFX7-NEXT:    buffer_store_dword v20, v23, s[0:3], 0 offen
-; GFX7-NEXT:    buffer_store_dword v9, v24, s[0:3], 0 offen
-; GFX7-NEXT:    buffer_store_dword v10, v25, s[0:3], 0 offen
-; GFX7-NEXT:    v_mov_b32_e32 v9, 0x58
-; GFX7-NEXT:    v_add_i32_e32 v9, vcc, v0, v9
-; GFX7-NEXT:    buffer_store_dword v11, v9, s[0:3], 0 offen
-; GFX7-NEXT:    v_mov_b32_e32 v9, 0x5c
-; GFX7-NEXT:    v_add_i32_e32 v9, vcc, v0, v9
-; GFX7-NEXT:    buffer_store_dword v12, v9, s[0:3], 0 offen
-; GFX7-NEXT:    v_mov_b32_e32 v9, 0x60
-; GFX7-NEXT:    v_add_i32_e32 v9, vcc, v0, v9
-; GFX7-NEXT:    s_waitcnt vmcnt(14)
-; GFX7-NEXT:    buffer_store_dword v5, v9, s[0:3], 0 offen
-; GFX7-NEXT:    v_mov_b32_e32 v5, 0x64
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
-; GFX7-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; GFX7-NEXT:    v_mov_b32_e32 v5, 0x68
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
-; GFX7-NEXT:    buffer_store_dword v7, v5, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 0x6c, v0
-; GFX7-NEXT:    buffer_store_dword v8, v5, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 0x70, v0
-; GFX7-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 0x74, v0
-; GFX7-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 0x78, v0
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
-; GFX7-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
-; GFX7-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_load_global_v64bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX8-NEXT:    buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX8-NEXT:    flat_load_dwordx4 v[38:41], v[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 16, v0
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dwordx4 v[48:51], v[2:3]
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 32, v0
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 48, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dwordx4 v[34:37], v[2:3]
-; GFX8-NEXT:    flat_load_dwordx4 v[52:55], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(3)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v40
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v41
-; GFX8-NEXT:    v_mov_b32_e32 v4, v40
-; GFX8-NEXT:    v_mov_b32_e32 v6, v41
-; GFX8-NEXT:    buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX8-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v38
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v39
-; GFX8-NEXT:    s_waitcnt vmcnt(4)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v48
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v49
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v50
-; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v51
-; GFX8-NEXT:    v_mov_b32_e32 v0, v38
-; GFX8-NEXT:    v_mov_b32_e32 v2, v39
-; GFX8-NEXT:    v_mov_b32_e32 v8, v48
-; GFX8-NEXT:    s_waitcnt vmcnt(3)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v34
-; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v35
-; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v36
-; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v37
-; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v25, 16, v52
-; GFX8-NEXT:    v_lshrrev_b32_e32 v27, 16, v53
-; GFX8-NEXT:    v_lshrrev_b32_e32 v29, 16, v54
-; GFX8-NEXT:    v_lshrrev_b32_e32 v31, 16, v55
-; GFX8-NEXT:    v_mov_b32_e32 v10, v49
-; GFX8-NEXT:    v_mov_b32_e32 v12, v50
-; GFX8-NEXT:    v_mov_b32_e32 v14, v51
-; GFX8-NEXT:    v_mov_b32_e32 v16, v34
-; GFX8-NEXT:    v_mov_b32_e32 v18, v35
-; GFX8-NEXT:    v_mov_b32_e32 v20, v36
-; GFX8-NEXT:    v_mov_b32_e32 v22, v37
-; GFX8-NEXT:    v_mov_b32_e32 v24, v52
-; GFX8-NEXT:    v_mov_b32_e32 v26, v53
-; GFX8-NEXT:    v_mov_b32_e32 v28, v54
-; GFX8-NEXT:    v_mov_b32_e32 v30, v55
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_load_global_v64bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT:    global_load_dwordx4 v[38:41], v[0:1], off
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    global_load_dwordx4 v[48:51], v[0:1], off offset:16
-; GFX9-NEXT:    global_load_dwordx4 v[34:37], v[0:1], off offset:32
-; GFX9-NEXT:    global_load_dwordx4 v[52:55], v[0:1], off offset:48
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v40
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v41
-; GFX9-NEXT:    v_mov_b32_e32 v4, v40
-; GFX9-NEXT:    v_mov_b32_e32 v6, v41
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v38
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v39
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v48
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v49
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v50
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v51
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v34
-; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v35
-; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 16, v36
-; GFX9-NEXT:    v_lshrrev_b32_e32 v23, 16, v37
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 16, v52
-; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 16, v53
-; GFX9-NEXT:    v_lshrrev_b32_e32 v29, 16, v54
-; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 16, v55
-; GFX9-NEXT:    v_mov_b32_e32 v0, v38
-; GFX9-NEXT:    v_mov_b32_e32 v2, v39
-; GFX9-NEXT:    v_mov_b32_e32 v8, v48
-; GFX9-NEXT:    v_mov_b32_e32 v10, v49
-; GFX9-NEXT:    v_mov_b32_e32 v12, v50
-; GFX9-NEXT:    v_mov_b32_e32 v14, v51
-; GFX9-NEXT:    v_mov_b32_e32 v16, v34
-; GFX9-NEXT:    v_mov_b32_e32 v18, v35
-; GFX9-NEXT:    v_mov_b32_e32 v20, v36
-; GFX9-NEXT:    v_mov_b32_e32 v22, v37
-; GFX9-NEXT:    v_mov_b32_e32 v24, v52
-; GFX9-NEXT:    v_mov_b32_e32 v26, v53
-; GFX9-NEXT:    v_mov_b32_e32 v28, v54
-; GFX9-NEXT:    v_mov_b32_e32 v30, v55
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_load_global_v64bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    global_load_dwordx4 v[64:67], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx4 v[48:51], v[0:1], off offset:16
-; GFX10-NEXT:    global_load_dwordx4 v[34:37], v[0:1], off offset:32
-; GFX10-NEXT:    global_load_dwordx4 v[52:55], v[0:1], off offset:48
-; GFX10-NEXT:    s_waitcnt vmcnt(3)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v64
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v65
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v66
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v67
-; GFX10-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v48
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v49
-; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v50
-; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v51
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v34
-; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v35
-; GFX10-NEXT:    v_lshrrev_b32_e32 v21, 16, v36
-; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v37
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v52
-; GFX10-NEXT:    v_lshrrev_b32_e32 v27, 16, v53
-; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v54
-; GFX10-NEXT:    v_lshrrev_b32_e32 v31, 16, v55
-; GFX10-NEXT:    v_mov_b32_e32 v0, v64
-; GFX10-NEXT:    v_mov_b32_e32 v2, v65
-; GFX10-NEXT:    v_mov_b32_e32 v4, v66
-; GFX10-NEXT:    v_mov_b32_e32 v6, v67
-; GFX10-NEXT:    v_mov_b32_e32 v8, v48
-; GFX10-NEXT:    v_mov_b32_e32 v10, v49
-; GFX10-NEXT:    v_mov_b32_e32 v12, v50
-; GFX10-NEXT:    v_mov_b32_e32 v14, v51
-; GFX10-NEXT:    v_mov_b32_e32 v16, v34
-; GFX10-NEXT:    v_mov_b32_e32 v18, v35
-; GFX10-NEXT:    v_mov_b32_e32 v20, v36
-; GFX10-NEXT:    v_mov_b32_e32 v22, v37
-; GFX10-NEXT:    v_mov_b32_e32 v24, v52
-; GFX10-NEXT:    v_mov_b32_e32 v26, v53
-; GFX10-NEXT:    v_mov_b32_e32 v28, v54
-; GFX10-NEXT:    v_mov_b32_e32 v30, v55
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <64 x bfloat>, ptr addrspace(1) %ptr
-  ret <64 x bfloat> %load
-}
-
-define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_store_global_v2bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_store_global_v2bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_store_global_v2bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_store_dword v[1:2], v0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_store_global_v2bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_store_dword v[1:2], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_store_global_v2bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_store_dword v[1:2], v0, off
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  store <2 x bfloat> %val, ptr addrspace(1) %ptr
-  ret void
-}
-
-define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_store_global_v3bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    buffer_store_short v0, v[3:4], s[4:7], 0 addr64
-; GCN-NEXT:    buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:2
-; GCN-NEXT:    buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_store_global_v3bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_store_short v0, v[3:4], s[4:7], 0 addr64
-; GFX7-NEXT:    buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:2
-; GFX7-NEXT:    buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_store_global_v3bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 2, v2
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
-; GFX8-NEXT:    flat_store_short v[2:3], v0
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GFX8-NEXT:    flat_store_short v[4:5], v6
-; GFX8-NEXT:    flat_store_short v[2:3], v1
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_store_global_v3bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_store_short v[2:3], v0, off
-; GFX9-NEXT:    global_store_short_d16_hi v[2:3], v0, off offset:2
-; GFX9-NEXT:    global_store_short v[2:3], v1, off offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_store_global_v3bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_store_short v[2:3], v0, off
-; GFX10-NEXT:    global_store_short_d16_hi v[2:3], v0, off offset:2
-; GFX10-NEXT:    global_store_short v[2:3], v1, off offset:4
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  store <3 x bfloat> %val, ptr addrspace(1) %ptr
-  ret void
-}
-
-define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_store_global_v4bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v3, v2
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_store_global_v4bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_store_global_v4bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_store_global_v4bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_store_global_v4bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  store <4 x bfloat> %val, ptr addrspace(1) %ptr
-  ret void
-}
-
-define void @v_store_global_v8bf16(<8 x bfloat> %val, ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_store_global_v8bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GCN-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v3, v2
-; GCN-NEXT:    v_or_b32_e32 v2, v5, v4
-; GCN-NEXT:    v_or_b32_e32 v3, v7, v6
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_store_global_v8bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v4
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v6
-; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_store_global_v8bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX8-NEXT:    v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_store_global_v8bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_store_global_v8bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  store <8 x bfloat> %val, ptr addrspace(1) %ptr
-  ret void
-}
-
-define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_store_global_v16bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GCN-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GCN-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GCN-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GCN-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GCN-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GCN-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GCN-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GCN-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GCN-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v3, v2
-; GCN-NEXT:    v_or_b32_e32 v2, v5, v4
-; GCN-NEXT:    v_or_b32_e32 v3, v7, v6
-; GCN-NEXT:    v_or_b32_e32 v4, v9, v8
-; GCN-NEXT:    v_or_b32_e32 v5, v11, v10
-; GCN-NEXT:    v_or_b32_e32 v6, v13, v12
-; GCN-NEXT:    v_or_b32_e32 v7, v15, v14
-; GCN-NEXT:    buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
-; GCN-NEXT:    buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_store_global_v16bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v4
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v6
-; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
-; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v8
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
-; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v10
-; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v13
-; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v12
-; GFX7-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v15
-; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff, v14
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
-; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
-; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_store_global_v16bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
-; GFX8-NEXT:    v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
-; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
-; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
-; GFX8-NEXT:    v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v8
-; GFX8-NEXT:    v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v9, vcc
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_store_global_v16bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off offset:16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_store_global_v16bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
-; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off offset:16
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  store <16 x bfloat> %val, ptr addrspace(1) %ptr
-  ret void
-}
-
-define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_store_global_v32bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v3, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v4
-; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v6
-; GCN-NEXT:    v_or_b32_e32 v3, v3, v4
-; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
-; GCN-NEXT:    v_and_b32_e32 v5, 0xffff, v8
-; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
-; GCN-NEXT:    v_and_b32_e32 v7, 0xffff, v10
-; GCN-NEXT:    v_or_b32_e32 v4, v4, v5
-; GCN-NEXT:    v_or_b32_e32 v5, v6, v7
-; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v13
-; GCN-NEXT:    v_and_b32_e32 v7, 0xffff, v12
-; GCN-NEXT:    v_or_b32_e32 v6, v6, v7
-; GCN-NEXT:    v_lshlrev_b32_e32 v7, 16, v15
-; GCN-NEXT:    v_and_b32_e32 v8, 0xffff, v14
-; GCN-NEXT:    v_or_b32_e32 v7, v7, v8
-; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v17
-; GCN-NEXT:    v_and_b32_e32 v9, 0xffff, v16
-; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v19
-; GCN-NEXT:    v_and_b32_e32 v11, 0xffff, v18
-; GCN-NEXT:    v_or_b32_e32 v8, v8, v9
-; GCN-NEXT:    v_or_b32_e32 v9, v10, v11
-; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v21
-; GCN-NEXT:    v_and_b32_e32 v11, 0xffff, v20
-; GCN-NEXT:    v_or_b32_e32 v10, v10, v11
-; GCN-NEXT:    v_lshlrev_b32_e32 v11, 16, v23
-; GCN-NEXT:    v_and_b32_e32 v12, 0xffff, v22
-; GCN-NEXT:    v_lshlrev_b32_e32 v13, 16, v25
-; GCN-NEXT:    v_and_b32_e32 v14, 0xffff, v24
-; GCN-NEXT:    v_lshlrev_b32_e32 v15, 16, v27
-; GCN-NEXT:    v_and_b32_e32 v16, 0xffff, v26
-; GCN-NEXT:    v_lshlrev_b32_e32 v17, 16, v29
-; GCN-NEXT:    v_and_b32_e32 v18, 0xffff, v28
-; GCN-NEXT:    v_and_b32_e32 v19, 0xffff, v30
-; GCN-NEXT:    v_or_b32_e32 v11, v11, v12
-; GCN-NEXT:    v_or_b32_e32 v12, v13, v14
-; GCN-NEXT:    v_or_b32_e32 v13, v15, v16
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s32
-; GCN-NEXT:    v_or_b32_e32 v14, v17, v18
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
-; GCN-NEXT:    buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
-; GCN-NEXT:    v_or_b32_e32 v15, v15, v19
-; GCN-NEXT:    buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:32
-; GCN-NEXT:    buffer_store_dwordx4 v[12:15], v[16:17], s[4:7], 0 addr64 offset:48
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_store_global_v32bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v3, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v4
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v6
-; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
-; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
-; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v10
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-NEXT:    v_or_b32_e32 v5, v6, v7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v13
-; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v12
-; GFX7-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v15
-; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff, v14
-; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v17
-; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff, v16
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v19
-; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff, v18
-; GFX7-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX7-NEXT:    v_or_b32_e32 v9, v10, v11
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v21
-; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff, v20
-; GFX7-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 16, v23
-; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff, v22
-; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 16, v25
-; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff, v24
-; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 16, v27
-; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff, v26
-; GFX7-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX7-NEXT:    v_or_b32_e32 v12, v13, v14
-; GFX7-NEXT:    v_or_b32_e32 v13, v15, v16
-; GFX7-NEXT:    buffer_load_dword v15, off, s[0:3], s32
-; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v29
-; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff, v28
-; GFX7-NEXT:    v_or_b32_e32 v14, v14, v16
-; GFX7-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
-; GFX7-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
-; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff, v30
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX7-NEXT:    v_or_b32_e32 v15, v15, v18
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
-; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT:    buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:32
-; GFX7-NEXT:    buffer_store_dwordx4 v[12:15], v[16:17], s[4:7], 0 addr64 offset:48
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_store_global_v32bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v20, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
-; GFX8-NEXT:    v_mov_b32_sdwa v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v1, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v2, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v3, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v22, 16, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v24, 16, v6
-; GFX8-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
-; GFX8-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
-; GFX8-NEXT:    v_mov_b32_sdwa v4, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v16
-; GFX8-NEXT:    v_mov_b32_sdwa v5, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v7, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v17, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v26, 16, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v27, 16, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
-; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v11
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 32, v16
-; GFX8-NEXT:    v_mov_b32_sdwa v8, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v9, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v11, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v17, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v20, 16, v12
-; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v13
-; GFX8-NEXT:    v_lshrrev_b32_e32 v22, 16, v14
-; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v15
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 48, v16
-; GFX8-NEXT:    v_mov_b32_sdwa v12, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v13, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v14, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v15, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v17, vcc
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[12:15]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_store_global_v32bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v20, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v23, 16, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v24, 16, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 16, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 16, v9
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
-; GFX9-NEXT:    v_mov_b32_sdwa v1, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v11
-; GFX9-NEXT:    v_mov_b32_sdwa v2, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v20, 16, v12
-; GFX9-NEXT:    v_mov_b32_sdwa v3, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 16, v13
-; GFX9-NEXT:    v_mov_b32_sdwa v4, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 16, v14
-; GFX9-NEXT:    v_mov_b32_sdwa v5, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v23, 16, v15
-; GFX9-NEXT:    v_mov_b32_sdwa v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v7, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v8, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v9, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v11, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v12, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v13, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v14, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v15, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    global_store_dwordx4 v[16:17], v[0:3], off
-; GFX9-NEXT:    global_store_dwordx4 v[16:17], v[4:7], off offset:16
-; GFX9-NEXT:    global_store_dwordx4 v[16:17], v[8:11], off offset:32
-; GFX9-NEXT:    global_store_dwordx4 v[16:17], v[12:15], off offset:48
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_store_global_v32bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v18, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
-; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v27, 16, v9
-; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v10
-; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v11
-; GFX10-NEXT:    v_lshrrev_b32_e32 v30, 16, v12
-; GFX10-NEXT:    v_lshrrev_b32_e32 v31, 16, v13
-; GFX10-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
-; GFX10-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v1, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v2, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v3, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v4, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v5, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v7, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v8, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v9, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v10, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v11, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v12, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v13, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v14, v32 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v15, v33 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    global_store_dwordx4 v[16:17], v[0:3], off
-; GFX10-NEXT:    global_store_dwordx4 v[16:17], v[4:7], off offset:16
-; GFX10-NEXT:    global_store_dwordx4 v[16:17], v[8:11], off offset:32
-; GFX10-NEXT:    global_store_dwordx4 v[16:17], v[12:15], off offset:48
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  store <32 x bfloat> %val, ptr addrspace(1) %ptr
-  ret void
-}
-
-define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_store_global_v64bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v3, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v4
-; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v6
-; GCN-NEXT:    v_or_b32_e32 v3, v3, v4
-; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:132
-; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:136
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v9
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v8
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v11
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v10
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
-; GCN-NEXT:    v_or_b32_e32 v1, v2, v3
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v13
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v12
-; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v15
-; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v14
-; GCN-NEXT:    v_or_b32_e32 v3, v3, v6
-; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v17
-; GCN-NEXT:    v_and_b32_e32 v7, 0xffff, v16
-; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v19
-; GCN-NEXT:    v_and_b32_e32 v9, 0xffff, v18
-; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v21
-; GCN-NEXT:    v_and_b32_e32 v11, 0xffff, v20
-; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v23
-; GCN-NEXT:    v_and_b32_e32 v13, 0xffff, v22
-; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v25
-; GCN-NEXT:    v_and_b32_e32 v15, 0xffff, v24
-; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v27
-; GCN-NEXT:    v_and_b32_e32 v17, 0xffff, v26
-; GCN-NEXT:    buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:16
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:4
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_or_b32_e32 v0, v6, v7
-; GCN-NEXT:    v_or_b32_e32 v1, v8, v9
-; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
-; GCN-NEXT:    v_or_b32_e32 v2, v10, v11
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:20
-; GCN-NEXT:    v_or_b32_e32 v3, v12, v13
-; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:24
-; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:28
-; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v29
-; GCN-NEXT:    v_and_b32_e32 v13, 0xffff, v28
-; GCN-NEXT:    buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32
-; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:32
-; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:36
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_or_b32_e32 v0, v14, v15
-; GCN-NEXT:    v_or_b32_e32 v1, v16, v17
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:40
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:44
-; GCN-NEXT:    v_or_b32_e32 v2, v12, v13
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:48
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:52
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v30
-; GCN-NEXT:    s_waitcnt vmcnt(14)
-; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
-; GCN-NEXT:    v_or_b32_e32 v3, v16, v3
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:56
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:60
-; GCN-NEXT:    buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:48
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:64
-; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:68
-; GCN-NEXT:    s_waitcnt vmcnt(14) expcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v19
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v8
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v7
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
-; GCN-NEXT:    v_or_b32_e32 v1, v2, v3
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:72
-; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:76
-; GCN-NEXT:    s_waitcnt vmcnt(14)
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v10
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v9
-; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:80
-; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:84
-; GCN-NEXT:    s_waitcnt vmcnt(14)
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v20
-; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v11
-; GCN-NEXT:    s_waitcnt vmcnt(12)
-; GCN-NEXT:    v_lshlrev_b32_e32 v7, 16, v14
-; GCN-NEXT:    v_and_b32_e32 v8, 0xffff, v21
-; GCN-NEXT:    s_waitcnt vmcnt(10)
-; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
-; GCN-NEXT:    v_and_b32_e32 v11, 0xffff, v15
-; GCN-NEXT:    v_or_b32_e32 v3, v3, v6
-; GCN-NEXT:    v_or_b32_e32 v6, v7, v8
-; GCN-NEXT:    v_or_b32_e32 v7, v10, v11
-; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:88
-; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:92
-; GCN-NEXT:    s_waitcnt vmcnt(10)
-; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v16
-; GCN-NEXT:    v_and_b32_e32 v10, 0xffff, v13
-; GCN-NEXT:    v_or_b32_e32 v8, v8, v10
-; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:96
-; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:100
-; GCN-NEXT:    s_waitcnt vmcnt(9)
-; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v18
-; GCN-NEXT:    v_and_b32_e32 v11, 0xffff, v17
-; GCN-NEXT:    s_waitcnt vmcnt(7)
-; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v19
-; GCN-NEXT:    v_and_b32_e32 v17, 0xffff, v22
-; GCN-NEXT:    s_waitcnt vmcnt(5)
-; GCN-NEXT:    v_lshlrev_b32_e32 v18, 16, v9
-; GCN-NEXT:    v_and_b32_e32 v19, 0xffff, v23
-; GCN-NEXT:    v_or_b32_e32 v9, v10, v11
-; GCN-NEXT:    v_or_b32_e32 v10, v16, v17
-; GCN-NEXT:    v_or_b32_e32 v11, v18, v19
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:104
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:108
-; GCN-NEXT:    s_waitcnt vmcnt(5)
-; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GCN-NEXT:    v_and_b32_e32 v18, 0xffff, v24
-; GCN-NEXT:    v_or_b32_e32 v12, v12, v18
-; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:112
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:120
-; GCN-NEXT:    s_waitcnt vmcnt(5)
-; GCN-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GCN-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GCN-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GCN-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GCN-NEXT:    v_or_b32_e32 v13, v13, v14
-; GCN-NEXT:    v_or_b32_e32 v14, v16, v15
-; GCN-NEXT:    v_or_b32_e32 v15, v18, v17
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:116
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:128
-; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
-; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:124
-; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GCN-NEXT:    v_or_b32_e32 v16, v18, v16
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v18, 0xffff, v19
-; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
-; GCN-NEXT:    buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:64
-; GCN-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[4:7], 0 addr64 offset:80
-; GCN-NEXT:    buffer_store_dwordx4 v[10:13], v[4:5], s[4:7], 0 addr64 offset:96
-; GCN-NEXT:    buffer_store_dwordx4 v[14:17], v[4:5], s[4:7], 0 addr64 offset:112
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_store_global_v64bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_or_b32_e32 v35, v1, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v4
-; GFX7-NEXT:    v_or_b32_e32 v37, v0, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v6
-; GFX7-NEXT:    v_or_b32_e32 v38, v0, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v9
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT:    v_or_b32_e32 v31, v0, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v13
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v12
-; GFX7-NEXT:    v_or_b32_e32 v36, v3, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v11
-; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v10
-; GFX7-NEXT:    v_or_b32_e32 v33, v0, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v15
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v14
-; GFX7-NEXT:    v_or_b32_e32 v32, v2, v3
-; GFX7-NEXT:    buffer_load_dword v3, off, s[0:3], s32
-; GFX7-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:4
-; GFX7-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:8
-; GFX7-NEXT:    v_or_b32_e32 v34, v0, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v17
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v16
-; GFX7-NEXT:    v_or_b32_e32 v4, v0, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v19
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v18
-; GFX7-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:16
-; GFX7-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:12
-; GFX7-NEXT:    v_or_b32_e32 v5, v0, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v21
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v20
-; GFX7-NEXT:    v_or_b32_e32 v6, v0, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v23
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v22
-; GFX7-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:24
-; GFX7-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:20
-; GFX7-NEXT:    v_or_b32_e32 v7, v0, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v25
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v24
-; GFX7-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:132
-; GFX7-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:136
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v27
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v26
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v29
-; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff, v28
-; GFX7-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:28
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v15
-; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff, v30
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    s_waitcnt vmcnt(9)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    s_waitcnt vmcnt(8)
-; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX7-NEXT:    s_waitcnt vmcnt(7)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX7-NEXT:    v_or_b32_e32 v8, v9, v8
-; GFX7-NEXT:    v_or_b32_e32 v3, v3, v15
-; GFX7-NEXT:    s_waitcnt vmcnt(6)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 16, v14
-; GFX7-NEXT:    s_waitcnt vmcnt(5)
-; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX7-NEXT:    v_or_b32_e32 v9, v9, v13
-; GFX7-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:32
-; GFX7-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:36
-; GFX7-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:40
-; GFX7-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:48
-; GFX7-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:44
-; GFX7-NEXT:    s_waitcnt vmcnt(9)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX7-NEXT:    s_waitcnt vmcnt(8)
-; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX7-NEXT:    v_or_b32_e32 v10, v12, v10
-; GFX7-NEXT:    s_waitcnt vmcnt(6)
-; GFX7-NEXT:    buffer_store_dwordx4 v[35:38], v[24:25], s[4:7], 0 addr64
-; GFX7-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:52
-; GFX7-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:56
-; GFX7-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:60
-; GFX7-NEXT:    buffer_store_dwordx4 v[31:34], v[24:25], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:64
-; GFX7-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:68
-; GFX7-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:72
-; GFX7-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:76
-; GFX7-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:80
-; GFX7-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:88
-; GFX7-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:84
-; GFX7-NEXT:    s_waitcnt vmcnt(14)
-; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX7-NEXT:    v_or_b32_e32 v11, v12, v11
-; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 16, v15
-; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff, v14
-; GFX7-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX7-NEXT:    s_waitcnt vmcnt(13)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 16, v16
-; GFX7-NEXT:    s_waitcnt vmcnt(12)
-; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff, v17
-; GFX7-NEXT:    v_or_b32_e32 v13, v13, v14
-; GFX7-NEXT:    s_waitcnt vmcnt(10)
-; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff, v18
-; GFX7-NEXT:    s_waitcnt vmcnt(9)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 16, v19
-; GFX7-NEXT:    v_or_b32_e32 v14, v14, v15
-; GFX7-NEXT:    s_waitcnt vmcnt(6)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 16, v21
-; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff, v20
-; GFX7-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:92
-; GFX7-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX7-NEXT:    s_waitcnt vmcnt(5)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 16, v23
-; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff, v22
-; GFX7-NEXT:    v_or_b32_e32 v16, v16, v17
-; GFX7-NEXT:    s_waitcnt vmcnt(3)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v17, 16, v27
-; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff, v26
-; GFX7-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:96
-; GFX7-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:100
-; GFX7-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:104
-; GFX7-NEXT:    v_or_b32_e32 v17, v17, v18
-; GFX7-NEXT:    s_waitcnt vmcnt(5)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v18, 16, v28
-; GFX7-NEXT:    s_waitcnt vmcnt(4)
-; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff, v29
-; GFX7-NEXT:    v_or_b32_e32 v18, v18, v23
-; GFX7-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:112
-; GFX7-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:108
-; GFX7-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:120
-; GFX7-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:116
-; GFX7-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:124
-; GFX7-NEXT:    s_waitcnt vmcnt(8)
-; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX7-NEXT:    s_waitcnt vmcnt(7)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX7-NEXT:    v_or_b32_e32 v19, v20, v19
-; GFX7-NEXT:    s_waitcnt vmcnt(5)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
-; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; GFX7-NEXT:    s_waitcnt vmcnt(4)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX7-NEXT:    s_waitcnt vmcnt(3)
-; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff, v26
-; GFX7-NEXT:    v_or_b32_e32 v20, v20, v21
-; GFX7-NEXT:    v_or_b32_e32 v21, v22, v23
-; GFX7-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v22, 16, v27
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff, v28
-; GFX7-NEXT:    v_or_b32_e32 v22, v22, v23
-; GFX7-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:128
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff, v29
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX7-NEXT:    v_or_b32_e32 v23, v23, v26
-; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], v[24:25], s[4:7], 0 addr64 offset:32
-; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], v[24:25], s[4:7], 0 addr64 offset:48
-; GFX7-NEXT:    buffer_store_dwordx4 v[8:11], v[24:25], s[4:7], 0 addr64 offset:64
-; GFX7-NEXT:    buffer_store_dwordx4 v[12:15], v[24:25], s[4:7], 0 addr64 offset:80
-; GFX7-NEXT:    buffer_store_dwordx4 v[16:19], v[24:25], s[4:7], 0 addr64 offset:96
-; GFX7-NEXT:    buffer_store_dwordx4 v[20:23], v[24:25], s[4:7], 0 addr64 offset:112
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_store_global_v64bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v31, 16, v4
-; GFX8-NEXT:    v_mov_b32_sdwa v4, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v31, 16, v5
-; GFX8-NEXT:    v_mov_b32_sdwa v5, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v31, 16, v6
-; GFX8-NEXT:    v_mov_b32_sdwa v6, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v31, 16, v7
-; GFX8-NEXT:    v_mov_b32_sdwa v7, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX8-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_add_u32_e32 v34, vcc, 16, v32
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_addc_u32_e32 v35, vcc, 0, v33, vcc
-; GFX8-NEXT:    flat_store_dwordx4 v[34:35], v[4:7]
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v9
-; GFX8-NEXT:    v_mov_b32_sdwa v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v10
-; GFX8-NEXT:    v_mov_b32_sdwa v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v11
-; GFX8-NEXT:    v_mov_b32_sdwa v10, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v12
-; GFX8-NEXT:    v_mov_b32_sdwa v11, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v13
-; GFX8-NEXT:    v_mov_b32_sdwa v12, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v14
-; GFX8-NEXT:    v_mov_b32_sdwa v13, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v15
-; GFX8-NEXT:    v_mov_b32_sdwa v14, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX8-NEXT:    v_mov_b32_sdwa v15, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX8-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX8-NEXT:    v_mov_b32_sdwa v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 32, v32
-; GFX8-NEXT:    v_mov_b32_sdwa v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v33, vcc
-; GFX8-NEXT:    flat_store_dwordx4 v[32:33], v[0:3]
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 48, v32
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v33, vcc
-; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[8:11]
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[12:15]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v17
-; GFX8-NEXT:    v_mov_b32_sdwa v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v18
-; GFX8-NEXT:    v_mov_b32_sdwa v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v19
-; GFX8-NEXT:    v_mov_b32_sdwa v18, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v20
-; GFX8-NEXT:    v_mov_b32_sdwa v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v21
-; GFX8-NEXT:    v_mov_b32_sdwa v20, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v22
-; GFX8-NEXT:    v_mov_b32_sdwa v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v23
-; GFX8-NEXT:    v_mov_b32_sdwa v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v24
-; GFX8-NEXT:    v_mov_b32_sdwa v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v25
-; GFX8-NEXT:    v_mov_b32_sdwa v24, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v26
-; GFX8-NEXT:    v_mov_b32_sdwa v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v27
-; GFX8-NEXT:    v_mov_b32_sdwa v26, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v28
-; GFX8-NEXT:    v_mov_b32_sdwa v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v29
-; GFX8-NEXT:    v_mov_b32_sdwa v28, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
-; GFX8-NEXT:    v_mov_b32_sdwa v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    s_waitcnt vmcnt(4)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v31
-; GFX8-NEXT:    v_mov_b32_sdwa v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 64, v32
-; GFX8-NEXT:    v_mov_b32_sdwa v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v33, vcc
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[16:19]
-; GFX8-NEXT:    v_mov_b32_e32 v0, 0x50
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v32, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v33, vcc
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[20:23]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x60, v32
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v33, vcc
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[24:27]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x70, v32
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v33, vcc
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[28:31]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_store_global_v64bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 16, v0
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 16, v1
-; GFX9-NEXT:    v_mov_b32_sdwa v1, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 16, v2
-; GFX9-NEXT:    v_mov_b32_sdwa v2, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 16, v3
-; GFX9-NEXT:    v_mov_b32_sdwa v3, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[0:3], off
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
-; GFX9-NEXT:    v_mov_b32_sdwa v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
-; GFX9-NEXT:    v_mov_b32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
-; GFX9-NEXT:    v_mov_b32_sdwa v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
-; GFX9-NEXT:    v_mov_b32_sdwa v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v9
-; GFX9-NEXT:    v_mov_b32_sdwa v8, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v10
-; GFX9-NEXT:    v_mov_b32_sdwa v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v11
-; GFX9-NEXT:    v_mov_b32_sdwa v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v12
-; GFX9-NEXT:    v_mov_b32_sdwa v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v13
-; GFX9-NEXT:    v_mov_b32_sdwa v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v14
-; GFX9-NEXT:    v_mov_b32_sdwa v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v15
-; GFX9-NEXT:    v_mov_b32_sdwa v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
-; GFX9-NEXT:    v_mov_b32_sdwa v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v17
-; GFX9-NEXT:    v_mov_b32_sdwa v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v18
-; GFX9-NEXT:    v_mov_b32_sdwa v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v19
-; GFX9-NEXT:    v_mov_b32_sdwa v18, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v20
-; GFX9-NEXT:    v_mov_b32_sdwa v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v21
-; GFX9-NEXT:    v_mov_b32_sdwa v20, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v22
-; GFX9-NEXT:    v_mov_b32_sdwa v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v23
-; GFX9-NEXT:    v_mov_b32_sdwa v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v24
-; GFX9-NEXT:    v_mov_b32_sdwa v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v25
-; GFX9-NEXT:    v_mov_b32_sdwa v24, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v26
-; GFX9-NEXT:    v_mov_b32_sdwa v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v27
-; GFX9-NEXT:    v_mov_b32_sdwa v26, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v28
-; GFX9-NEXT:    v_mov_b32_sdwa v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v29
-; GFX9-NEXT:    v_mov_b32_sdwa v28, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
-; GFX9-NEXT:    v_mov_b32_sdwa v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v31
-; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[4:7], off offset:16
-; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[8:11], off offset:32
-; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[12:15], off offset:48
-; GFX9-NEXT:    v_mov_b32_sdwa v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[16:19], off offset:64
-; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[20:23], off offset:80
-; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[24:27], off offset:96
-; GFX9-NEXT:    global_store_dwordx4 v[32:33], v[28:31], off offset:112
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_store_global_v64bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v34, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v38, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v39, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v48, 16, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v49, 16, v7
-; GFX10-NEXT:    v_lshrrev_b32_e32 v50, 16, v8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v51, 16, v9
-; GFX10-NEXT:    v_lshrrev_b32_e32 v52, 16, v10
-; GFX10-NEXT:    v_lshrrev_b32_e32 v53, 16, v11
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v34, 16, v19
-; GFX10-NEXT:    v_lshrrev_b32_e32 v54, 16, v12
-; GFX10-NEXT:    v_lshrrev_b32_e32 v55, 16, v13
-; GFX10-NEXT:    v_lshrrev_b32_e32 v64, 16, v14
-; GFX10-NEXT:    v_lshrrev_b32_e32 v65, 16, v15
-; GFX10-NEXT:    v_lshrrev_b32_e32 v66, 16, v16
-; GFX10-NEXT:    v_lshrrev_b32_e32 v67, 16, v17
-; GFX10-NEXT:    v_lshrrev_b32_e32 v68, 16, v18
-; GFX10-NEXT:    v_mov_b32_sdwa v1, v35 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 16, v20
-; GFX10-NEXT:    v_mov_b32_sdwa v2, v36 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v21
-; GFX10-NEXT:    v_mov_b32_sdwa v3, v37 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v22
-; GFX10-NEXT:    v_mov_b32_sdwa v4, v38 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v38, 16, v23
-; GFX10-NEXT:    v_mov_b32_sdwa v5, v39 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v39, 16, v24
-; GFX10-NEXT:    v_mov_b32_sdwa v6, v48 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v48, 16, v25
-; GFX10-NEXT:    v_mov_b32_sdwa v7, v49 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v49, 16, v26
-; GFX10-NEXT:    v_mov_b32_sdwa v8, v50 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v50, 16, v27
-; GFX10-NEXT:    v_mov_b32_sdwa v9, v51 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v51, 16, v28
-; GFX10-NEXT:    v_mov_b32_sdwa v10, v52 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v52, 16, v29
-; GFX10-NEXT:    v_mov_b32_sdwa v11, v53 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v53, 16, v30
-; GFX10-NEXT:    v_mov_b32_sdwa v19, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v12, v54 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v13, v55 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v14, v64 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v15, v65 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v16, v66 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v17, v67 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v18, v68 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v20, v35 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v21, v36 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v22, v37 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v23, v38 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v24, v39 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v25, v48 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v26, v49 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v27, v50 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v28, v51 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v29, v52 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v30, v53 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[0:3], off
-; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[4:7], off offset:16
-; GFX10-NEXT:    v_lshrrev_b32_e32 v34, 16, v31
-; GFX10-NEXT:    v_mov_b32_sdwa v31, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[8:11], off offset:32
-; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[12:15], off offset:48
-; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[16:19], off offset:64
-; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[20:23], off offset:80
-; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[24:27], off offset:96
-; GFX10-NEXT:    global_store_dwordx4 v[32:33], v[28:31], off offset:112
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  store <64 x bfloat> %val, ptr addrspace(1) %ptr
-  ret void
-}
-
-define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) {
-; GCN-LABEL: test_store_fpimm:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v4, 0x3f80
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    v_mov_b32_e32 v5, 0x4228
-; GCN-NEXT:    buffer_store_short v4, v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    buffer_store_short v5, v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_store_fpimm:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0x3f80
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_store_short v4, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0x4228
-; GFX7-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_store_fpimm:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0x3f80
-; GFX8-NEXT:    flat_store_short v[0:1], v4
-; GFX8-NEXT:    v_mov_b32_e32 v0, 0x4228
-; GFX8-NEXT:    flat_store_short v[2:3], v0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_store_fpimm:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3f80
-; GFX9-NEXT:    global_store_short v[0:1], v4, off
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4228
-; GFX9-NEXT:    global_store_short v[2:3], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_store_fpimm:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0x3f80
-; GFX10-NEXT:    v_mov_b32_e32 v5, 0x4228
-; GFX10-NEXT:    global_store_short v[0:1], v4, off
-; GFX10-NEXT:    global_store_short v[2:3], v5, off
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  store bfloat 1.0, ptr addrspace(1) %ptr0
-  store bfloat 42.0, ptr addrspace(1) %ptr1
-  ret void
-}
-
-; FIXME: unable to translate instruction: fptrunc
-; define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-;   %val = load float, ptr addrspace(1) %in
-;   %val.bf16 = fptrunc float %val to bfloat
-;   store bfloat %val.bf16, ptr addrspace(1) %out
-;   ret void
-; }
-
-; FIXME: unable to translate instruction: fptrunc
-; define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-;   %val = load double, ptr addrspace(1) %in
-;   %val.bf16 = fptrunc double %val to bfloat
-;   store bfloat %val.bf16, ptr addrspace(1) %out
-;   ret void
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-;   %val = load bfloat, ptr addrspace(1) %in
-;   %val.f32 = fpext bfloat %val to float
-;   store float %val.f32, ptr addrspace(1) %out
-;   ret void
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-;   %val = load bfloat, ptr addrspace(1) %in
-;   %val.f64 = fpext bfloat %val to double
-;   store double %val.f64, ptr addrspace(1) %out
-;   ret void
-; }
-
-define void @test_load_store_v2bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_load_store_v2bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_load_store_v2bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_load_store_v2bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    flat_store_dword v[2:3], v0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_load_store_v2bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dword v[2:3], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_load_store_v2bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v[2:3], v0, off
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %val = load <2 x bfloat>, ptr addrspace(1) %in
-  store <2 x bfloat> %val, ptr addrspace(1) %out
-  ret void
-}
-
-define void @test_load_store_v4bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_load_store_v4bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_load_store_v4bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_load_store_v4bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_load_store_v4bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_load_store_v4bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %val = load <4 x bfloat>, ptr addrspace(1) %in
-  store <4 x bfloat> %val, ptr addrspace(1) %out
-  ret void
-}
-
-define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_load_store_v8bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_load_store_v8bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_load_store_v8bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_load_store_v8bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_load_store_v8bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %val = load <8 x bfloat>, ptr addrspace(1) %in
-  store <8 x bfloat> %val, ptr addrspace(1) %out
-  ret void
-}
-
-define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_load_store_v16bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_load_store_v16bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_load_store_v16bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_load_store_v16bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_load_store_v16bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %val = load <16 x bfloat>, ptr addrspace(1) %in
-  store <16 x bfloat> %val, ptr addrspace(1) %out
-  ret void
-}
-
-define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_arg_store:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    buffer_store_short v0, v[1:2], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_arg_store:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_store_short v0, v[1:2], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_arg_store:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_store_short v[1:2], v0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_arg_store:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_store_short v[1:2], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_arg_store:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_store_short v[1:2], v0, off
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  store bfloat %in, ptr addrspace(1) %out
-  ret void
-}
-
-define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_arg_store_v2bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_arg_store_v2bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_arg_store_v2bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_store_dword v[1:2], v0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_arg_store_v2bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_store_dword v[1:2], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_arg_store_v2bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_store_dword v[1:2], v0, off
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  store <2 x bfloat> %in, ptr addrspace(1) %out
-  ret void
-}
-
-define void @test_arg_store_v3bf16(<3 x bfloat> %in, <3 x bfloat> addrspace(1)* %out) {
-; GCN-LABEL: test_arg_store_v3bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    buffer_store_short v0, v[3:4], s[4:7], 0 addr64
-; GCN-NEXT:    buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:2
-; GCN-NEXT:    buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_arg_store_v3bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_store_short v0, v[3:4], s[4:7], 0 addr64
-; GFX7-NEXT:    buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:2
-; GFX7-NEXT:    buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_arg_store_v3bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 2, v2
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
-; GFX8-NEXT:    flat_store_short v[2:3], v0
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GFX8-NEXT:    flat_store_short v[4:5], v6
-; GFX8-NEXT:    flat_store_short v[2:3], v1
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_arg_store_v3bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_store_short v[2:3], v0, off
-; GFX9-NEXT:    global_store_short_d16_hi v[2:3], v0, off offset:2
-; GFX9-NEXT:    global_store_short v[2:3], v1, off offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_arg_store_v3bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_store_short v[2:3], v0, off
-; GFX10-NEXT:    global_store_short_d16_hi v[2:3], v0, off offset:2
-; GFX10-NEXT:    global_store_short v[2:3], v1, off offset:4
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  store <3 x bfloat> %in, <3 x bfloat> addrspace(1) * %out
-  ret void
-}
-
-define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_arg_store_v4bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v3, v2
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_arg_store_v4bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_arg_store_v4bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_arg_store_v4bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_arg_store_v4bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  store <4 x bfloat> %in, ptr addrspace(1)  %out
-  ret void
-}
-
-define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_arg_store_v8bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GCN-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v3, v2
-; GCN-NEXT:    v_or_b32_e32 v2, v5, v4
-; GCN-NEXT:    v_or_b32_e32 v3, v7, v6
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_arg_store_v8bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v4
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v6
-; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_arg_store_v8bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX8-NEXT:    v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_arg_store_v8bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_arg_store_v8bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  store <8 x bfloat> %in, ptr addrspace(1) %out
-  ret void
-}
-
-define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_arg_store_v16bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GCN-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GCN-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GCN-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GCN-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GCN-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GCN-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GCN-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GCN-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GCN-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v3, v2
-; GCN-NEXT:    v_or_b32_e32 v2, v5, v4
-; GCN-NEXT:    v_or_b32_e32 v3, v7, v6
-; GCN-NEXT:    v_or_b32_e32 v4, v9, v8
-; GCN-NEXT:    v_or_b32_e32 v5, v11, v10
-; GCN-NEXT:    v_or_b32_e32 v6, v13, v12
-; GCN-NEXT:    v_or_b32_e32 v7, v15, v14
-; GCN-NEXT:    buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
-; GCN-NEXT:    buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_arg_store_v16bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v4
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v6
-; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
-; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v8
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
-; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v10
-; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v13
-; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v12
-; GFX7-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v15
-; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff, v14
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
-; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
-; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_arg_store_v16bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
-; GFX8-NEXT:    v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
-; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
-; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
-; GFX8-NEXT:    v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v8
-; GFX8-NEXT:    v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v9, vcc
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_arg_store_v16bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off offset:16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_arg_store_v16bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
-; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off offset:16
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  store <16 x bfloat> %in, ptr addrspace(1) %out
-  ret void
-}
-
-define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_inreg_arg_store:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    s_mov_b32 s38, 0
-; GCN-NEXT:    s_mov_b32 s39, 0xf000
-; GCN-NEXT:    s_mov_b64 s[36:37], 0
-; GCN-NEXT:    buffer_store_short v2, v[0:1], s[36:39], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_inreg_arg_store:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v2, s4
-; GFX7-NEXT:    s_mov_b32 s38, 0
-; GFX7-NEXT:    s_mov_b32 s39, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[36:37], 0
-; GFX7-NEXT:    buffer_store_short v2, v[0:1], s[36:39], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_inreg_arg_store:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    flat_store_short v[0:1], v2
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_inreg_arg_store:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    global_store_short v[0:1], v2, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_inreg_arg_store:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-NEXT:    global_store_short v[0:1], v2, off
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  store bfloat %in, ptr addrspace(1) %out
-  ret void
-}
-
-define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) {
-; GCN-LABEL: test_byval:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    buffer_store_short v0, off, s[0:3], s32
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_byval:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], s32
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_byval:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    buffer_store_short v0, off, s[0:3], s32
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_byval:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], s32
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_byval:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], s32
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  store bfloat %val, ptr addrspace(5) %bv
-  %retval = load bfloat, ptr addrspace(5) %bv
-  ret bfloat %retval
-}
-
-define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) {
-; GCN-LABEL: test_sret:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_sret:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_sret:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_sret:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_sret:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  store bfloat %val, ptr addrspace(5) %sret
-  ret void
-}
-
-define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_bitcast_from_bfloat:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_bitcast_from_bfloat:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_bitcast_from_bfloat:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    flat_store_short v[2:3], v0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_bitcast_from_bfloat:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_short v[2:3], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_bitcast_from_bfloat:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_short v[2:3], v0, off
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %val = load bfloat, ptr addrspace(1) %in
-  %val_int = bitcast bfloat %val to i16
-  store i16 %val_int, ptr addrspace(1) %out
-  ret void
-}
-
-define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in) {
-; GCN-LABEL: test_bitcast_to_bfloat:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v2, v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_bitcast_to_bfloat:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_store_short v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_bitcast_to_bfloat:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_ushort v2, v[2:3]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    flat_store_short v[0:1], v2
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_bitcast_to_bfloat:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v2, v[2:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_short v[0:1], v2, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_bitcast_to_bfloat:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_ushort v2, v[2:3], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_short v[0:1], v2, off
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %val = load i16, ptr addrspace(1) %in
-  %val_fp = bitcast i16 %val to bfloat
-  store bfloat %val_fp, ptr addrspace(1) %out
-  ret void
-}
-
-define bfloat @test_ret(bfloat %in) {
-; GCN-LABEL: test_ret:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_ret:
-; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_ret:
-; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_ret:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_ret:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  ret bfloat %in
-}
-
-define <2 x bfloat> @test_ret_v2bf16(<2 x bfloat> %in) {
-; GCN-LABEL: test_ret_v2bf16:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_ret_v2bf16:
-; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_ret_v2bf16:
-; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_ret_v2bf16:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_ret_v2bf16:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  ret <2 x bfloat> %in
-}
-
-define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) {
-; GCN-LABEL: test_ret_v3bf16:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_ret_v3bf16:
-; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_ret_v3bf16:
-; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_ret_v3bf16:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_ret_v3bf16:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  ret <3 x bfloat> %in
-}
-
-define <4 x bfloat> @test_ret_v4bf16(<4 x bfloat> %in) {
-; GCN-LABEL: test_ret_v4bf16:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_ret_v4bf16:
-; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_ret_v4bf16:
-; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_ret_v4bf16:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_ret_v4bf16:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  ret <4 x bfloat> %in
-}
-
-define <8 x bfloat> @test_ret_v8bf16(<8 x bfloat> %in) {
-; GCN-LABEL: test_ret_v8bf16:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_ret_v8bf16:
-; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_ret_v8bf16:
-; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_ret_v8bf16:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_ret_v8bf16:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX10-NEXT:    v_mov_b32_e32 v2, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  ret <8 x bfloat> %in
-}
-
-define <16 x bfloat> @test_ret_v16bf16(<16 x bfloat> %in) {
-; GCN-LABEL: test_ret_v16bf16:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_ret_v16bf16:
-; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_ret_v16bf16:
-; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v4, v1
-; GFX8-NEXT:    v_mov_b32_e32 v8, v2
-; GFX8-NEXT:    v_mov_b32_e32 v6, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; GFX8-NEXT:    v_mov_b32_e32 v2, v4
-; GFX8-NEXT:    v_mov_b32_e32 v4, v8
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_ret_v16bf16:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v4, v1
-; GFX9-NEXT:    v_mov_b32_e32 v8, v2
-; GFX9-NEXT:    v_mov_b32_e32 v6, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; GFX9-NEXT:    v_mov_b32_e32 v2, v4
-; GFX9-NEXT:    v_mov_b32_e32 v4, v8
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_ret_v16bf16:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, v2
-; GFX10-NEXT:    v_mov_b32_e32 v6, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX10-NEXT:    v_mov_b32_e32 v2, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; GFX10-NEXT:    v_mov_b32_e32 v1, v8
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  ret <16 x bfloat> %in
-}
-
-define void @test_call(bfloat %in, ptr addrspace(5) %out) {
-; GCN-LABEL: test_call:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s33
-; GCN-NEXT:    s_mov_b32 s33, s32
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_writelane_b32 v2, s30, 0
-; GCN-NEXT:    v_writelane_b32 v2, s31, 1
-; GCN-NEXT:    s_getpc_b64 s[4:5]
-; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v2, 1
-; GCN-NEXT:    v_readlane_b32 s30, v2, 0
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    s_mov_b32 s33, s8
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_call:
-; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s8, s33
-; GFX7-NEXT:    s_mov_b32 s33, s32
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_addk_i32 s32, 0x400
-; GFX7-NEXT:    s_getpc_b64 s[4:5]
-; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4
-; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX7-NEXT:    v_writelane_b32 v2, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v2, s31, 1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX7-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v2, 0
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX7-NEXT:    s_mov_b32 s33, s8
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_call:
-; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s6, s33
-; GFX8-NEXT:    s_mov_b32 s33, s32
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_addk_i32 s32, 0x400
-; GFX8-NEXT:    s_getpc_b64 s[4:5]
-; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4
-; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX8-NEXT:    v_writelane_b32 v2, s30, 0
-; GFX8-NEXT:    v_writelane_b32 v2, s31, 1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v2, 0
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX8-NEXT:    s_mov_b32 s33, s6
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_call:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s6, s33
-; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    s_mov_b32 s33, s6
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_call:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s6, s33
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v2, s30, 0
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_writelane_b32 v2, s31, 1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    s_mov_b32 s33, s6
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %result = call bfloat @test_arg_store(bfloat %in)
-  store volatile bfloat %result, ptr addrspace(5) %out
-  ret void
-}
-
-define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
-; GCN-LABEL: test_call_v2bf16:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s33
-; GCN-NEXT:    s_mov_b32 s33, s32
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_writelane_b32 v3, s30, 0
-; GCN-NEXT:    v_writelane_b32 v3, s31, 1
-; GCN-NEXT:    s_getpc_b64 s[4:5]
-; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v3, 1
-; GCN-NEXT:    v_readlane_b32 s30, v3, 0
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    s_mov_b32 s33, s8
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_call_v2bf16:
-; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s8, s33
-; GFX7-NEXT:    s_mov_b32 s33, s32
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_addk_i32 s32, 0x400
-; GFX7-NEXT:    s_getpc_b64 s[4:5]
-; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX7-NEXT:    v_writelane_b32 v3, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v3, s31, 1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v3, 0
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX7-NEXT:    s_mov_b32 s33, s8
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_call_v2bf16:
-; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s6, s33
-; GFX8-NEXT:    s_mov_b32 s33, s32
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_addk_i32 s32, 0x400
-; GFX8-NEXT:    s_getpc_b64 s[4:5]
-; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX8-NEXT:    v_writelane_b32 v2, s30, 0
-; GFX8-NEXT:    v_writelane_b32 v2, s31, 1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v2, 0
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX8-NEXT:    s_mov_b32 s33, s6
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_call_v2bf16:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s6, s33
-; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v2, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    s_mov_b32 s33, s6
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_call_v2bf16:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s6, s33
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v2, s30, 0
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_writelane_b32 v2, s31, 1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    s_mov_b32 s33, s6
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %result = call <2 x bfloat> @test_arg_store_v2bf16(<2 x bfloat> %in)
-  store volatile <2 x bfloat> %result, ptr addrspace(5) %out
-  ret void
-}
-
-define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
-; GCN-LABEL: test_call_v3bf16:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s33
-; GCN-NEXT:    s_mov_b32 s33, s32
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_writelane_b32 v4, s30, 0
-; GCN-NEXT:    v_writelane_b32 v4, s31, 1
-; GCN-NEXT:    s_getpc_b64 s[4:5]
-; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 4, v3
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
-; GCN-NEXT:    buffer_store_dword v0, v3, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_short v2, v5, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v4, 1
-; GCN-NEXT:    v_readlane_b32 s30, v4, 0
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    s_mov_b32 s33, s8
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_call_v3bf16:
-; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s8, s33
-; GFX7-NEXT:    s_mov_b32 s33, s32
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_addk_i32 s32, 0x400
-; GFX7-NEXT:    s_getpc_b64 s[4:5]
-; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX7-NEXT:    v_writelane_b32 v4, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v4, s31, 1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT:    buffer_store_dword v0, v3, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v3
-; GFX7-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_readlane_b32 s31, v4, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v4, 0
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX7-NEXT:    s_mov_b32 s33, s8
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_call_v3bf16:
-; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s6, s33
-; GFX8-NEXT:    s_mov_b32 s33, s32
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_addk_i32 s32, 0x400
-; GFX8-NEXT:    s_getpc_b64 s[4:5]
-; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX8-NEXT:    v_writelane_b32 v3, s30, 0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    v_writelane_b32 v3, s31, 1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v2
-; GFX8-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v3, 0
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX8-NEXT:    s_mov_b32 s33, s6
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_call_v3bf16:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s6, s33
-; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v3, s30, 0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_writelane_b32 v3, s31, 1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
-; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v0
-; GFX9-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v3, 0
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    s_mov_b32 s33, s6
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_call_v3bf16:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s6, s33
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v3, s30, 0
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_writelane_b32 v3, s31, 1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
-; GFX10-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v3, 0
-; GFX10-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen offset:4
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    s_mov_b32 s33, s6
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %result = call <3 x bfloat> @test_arg_store_v2bf16(<3 x bfloat> %in)
-  store volatile <3 x bfloat> %result, ptr addrspace(5) %out
-  ret void
-}
-
-define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
-; GCN-LABEL: test_call_v4bf16:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s33
-; GCN-NEXT:    s_mov_b32 s33, s32
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_writelane_b32 v5, s30, 0
-; GCN-NEXT:    v_writelane_b32 v5, s31, 1
-; GCN-NEXT:    s_getpc_b64 s[4:5]
-; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, 4, v4
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v3, v2
-; GCN-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v1, v6, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v5, 1
-; GCN-NEXT:    v_readlane_b32 s30, v5, 0
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    s_mov_b32 s33, s8
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_call_v4bf16:
-; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s8, s33
-; GFX7-NEXT:    s_mov_b32 s33, s32
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_addk_i32 s32, 0x400
-; GFX7-NEXT:    s_getpc_b64 s[4:5]
-; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX7-NEXT:    v_writelane_b32 v5, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v5, s31, 1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v4
-; GFX7-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v5, 0
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX7-NEXT:    s_mov_b32 s33, s8
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_call_v4bf16:
-; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s6, s33
-; GFX8-NEXT:    s_mov_b32 s33, s32
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_addk_i32 s32, 0x400
-; GFX8-NEXT:    s_getpc_b64 s[4:5]
-; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX8-NEXT:    v_writelane_b32 v3, s30, 0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    v_writelane_b32 v3, s31, 1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v2
-; GFX8-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v3, 0
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX8-NEXT:    s_mov_b32 s33, s6
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_call_v4bf16:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s6, s33
-; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_writelane_b32 v3, s30, 0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_writelane_b32 v3, s31, 1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v3, 0
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    s_mov_b32 s33, s6
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_call_v4bf16:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s6, s33
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX10-NEXT:    v_writelane_b32 v3, s30, 0
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_writelane_b32 v3, s31, 1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX10-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v3, 0
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    s_mov_b32 s33, s6
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %result = call <4 x bfloat> @test_arg_store_v2bf16(<4 x bfloat> %in)
-  store volatile <4 x bfloat> %result, ptr addrspace(5) %out
-  ret void
-}
-
-define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
-; GCN-LABEL: test_call_v8bf16:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s33
-; GCN-NEXT:    s_mov_b32 s33, s32
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_writelane_b32 v9, s30, 0
-; GCN-NEXT:    v_writelane_b32 v9, s31, 1
-; GCN-NEXT:    s_getpc_b64 s[4:5]
-; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GCN-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, 4, v8
-; GCN-NEXT:    v_add_i32_e32 v11, vcc, 8, v8
-; GCN-NEXT:    v_add_i32_e32 v12, vcc, 12, v8
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v3, v2
-; GCN-NEXT:    v_or_b32_e32 v2, v5, v4
-; GCN-NEXT:    v_or_b32_e32 v3, v7, v6
-; GCN-NEXT:    buffer_store_dword v0, v8, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v1, v10, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v2, v11, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v3, v12, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v9, 1
-; GCN-NEXT:    v_readlane_b32 s30, v9, 0
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    s_mov_b32 s33, s8
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_call_v8bf16:
-; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s8, s33
-; GFX7-NEXT:    s_mov_b32 s33, s32
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_addk_i32 s32, 0x400
-; GFX7-NEXT:    s_getpc_b64 s[4:5]
-; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX7-NEXT:    v_writelane_b32 v9, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v9, s31, 1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v4
-; GFX7-NEXT:    buffer_store_dword v0, v8, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v8
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v6
-; GFX7-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 8, v8
-; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 12, v8
-; GFX7-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v9, 0
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX7-NEXT:    s_mov_b32 s33, s8
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_call_v8bf16:
-; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s6, s33
-; GFX8-NEXT:    s_mov_b32 s33, s32
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_addk_i32 s32, 0x400
-; GFX8-NEXT:    s_getpc_b64 s[4:5]
-; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v2, v1
-; GFX8-NEXT:    v_writelane_b32 v5, s30, 0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT:    v_writelane_b32 v5, s31, 1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX8-NEXT:    v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX8-NEXT:    v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX8-NEXT:    v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v4
-; GFX8-NEXT:    v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 12, v4
-; GFX8-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v5, 0
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX8-NEXT:    s_mov_b32 s33, s6
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_call_v8bf16:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s6, s33
-; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-NEXT:    v_writelane_b32 v5, s30, 0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX9-NEXT:    v_writelane_b32 v5, s31, 1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v5, 0
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    s_mov_b32 s33, s6
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_call_v8bf16:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s6, s33
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX10-NEXT:    v_mov_b32_e32 v2, v1
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_writelane_b32 v5, s30, 0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_writelane_b32 v5, s31, 1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX10-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_readlane_b32 s30, v5, 0
-; GFX10-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT:    buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    s_mov_b32 s33, s6
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %result = call <8 x bfloat> @test_arg_store_v2bf16(<8 x bfloat> %in)
-  store volatile <8 x bfloat> %result, ptr addrspace(5) %out
-  ret void
-}
-
-define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
-; GCN-LABEL: test_call_v16bf16:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s33
-; GCN-NEXT:    s_mov_b32 s33, s32
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_addk_i32 s32, 0x400
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_writelane_b32 v17, s30, 0
-; GCN-NEXT:    v_writelane_b32 v17, s31, 1
-; GCN-NEXT:    s_getpc_b64 s[4:5]
-; GCN-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GCN-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GCN-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GCN-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GCN-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GCN-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GCN-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GCN-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GCN-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GCN-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GCN-NEXT:    v_add_i32_e32 v18, vcc, 4, v16
-; GCN-NEXT:    v_add_i32_e32 v19, vcc, 8, v16
-; GCN-NEXT:    v_add_i32_e32 v20, vcc, 12, v16
-; GCN-NEXT:    v_add_i32_e32 v21, vcc, 16, v16
-; GCN-NEXT:    v_add_i32_e32 v22, vcc, 20, v16
-; GCN-NEXT:    v_add_i32_e32 v23, vcc, 24, v16
-; GCN-NEXT:    v_add_i32_e32 v24, vcc, 28, v16
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v3, v2
-; GCN-NEXT:    v_or_b32_e32 v2, v5, v4
-; GCN-NEXT:    v_or_b32_e32 v3, v7, v6
-; GCN-NEXT:    v_or_b32_e32 v4, v9, v8
-; GCN-NEXT:    v_or_b32_e32 v5, v11, v10
-; GCN-NEXT:    v_or_b32_e32 v6, v13, v12
-; GCN-NEXT:    v_or_b32_e32 v7, v15, v14
-; GCN-NEXT:    buffer_store_dword v0, v16, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v1, v18, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v2, v19, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v3, v20, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v4, v21, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v5, v22, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v6, v23, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v7, v24, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_readlane_b32 s31, v17, 1
-; GCN-NEXT:    v_readlane_b32 s30, v17, 0
-; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    s_mov_b32 s33, s8
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_call_v16bf16:
-; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s8, s33
-; GFX7-NEXT:    s_mov_b32 s33, s32
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_addk_i32 s32, 0x400
-; GFX7-NEXT:    s_getpc_b64 s[4:5]
-; GFX7-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GFX7-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX7-NEXT:    v_writelane_b32 v17, s30, 0
-; GFX7-NEXT:    v_writelane_b32 v17, s31, 1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v4
-; GFX7-NEXT:    buffer_store_dword v0, v16, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v16
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v6
-; GFX7-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 8, v16
-; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
-; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v8
-; GFX7-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 12, v16
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
-; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v10
-; GFX7-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 16, v16
-; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v13
-; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v12
-; GFX7-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 20, v16
-; GFX7-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v15
-; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff, v14
-; GFX7-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 24, v16
-; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
-; GFX7-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 28, v16
-; GFX7-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_readlane_b32 s31, v17, 1
-; GFX7-NEXT:    v_readlane_b32 s30, v17, 0
-; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT:    buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX7-NEXT:    s_mov_b32 s33, s8
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_call_v16bf16:
-; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s6, s33
-; GFX8-NEXT:    s_mov_b32 s33, s32
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_addk_i32 s32, 0x400
-; GFX8-NEXT:    s_getpc_b64 s[4:5]
-; GFX8-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
-; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v4, v1
-; GFX8-NEXT:    v_mov_b32_e32 v10, v2
-; GFX8-NEXT:    v_mov_b32_e32 v6, v3
-; GFX8-NEXT:    v_writelane_b32 v9, s30, 0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; GFX8-NEXT:    v_mov_b32_e32 v2, v4
-; GFX8-NEXT:    v_mov_b32_e32 v4, v10
-; GFX8-NEXT:    v_writelane_b32 v9, s31, 1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
-; GFX8-NEXT:    v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
-; GFX8-NEXT:    v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    buffer_store_dword v0, v8, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
-; GFX8-NEXT:    v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
-; GFX8-NEXT:    v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 12, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v5
-; GFX8-NEXT:    v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
-; GFX8-NEXT:    v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 20, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
-; GFX8-NEXT:    v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 24, v8
-; GFX8-NEXT:    v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 28, v8
-; GFX8-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX8-NEXT:    v_readlane_b32 s30, v9, 0
-; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX8-NEXT:    s_mov_b32 s33, s6
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_call_v16bf16:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s6, s33
-; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v4, v1
-; GFX9-NEXT:    v_mov_b32_e32 v10, v2
-; GFX9-NEXT:    v_mov_b32_e32 v6, v3
-; GFX9-NEXT:    v_writelane_b32 v9, s30, 0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; GFX9-NEXT:    v_mov_b32_e32 v2, v4
-; GFX9-NEXT:    v_mov_b32_e32 v4, v10
-; GFX9-NEXT:    v_writelane_b32 v9, s31, 1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    buffer_store_dword v0, v8, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v1, v8, s[0:3], 0 offen offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v2, v8, s[0:3], 0 offen offset:8
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v3, v8, s[0:3], 0 offen offset:12
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v4, v8, s[0:3], 0 offen offset:16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen offset:20
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v9, 0
-; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    s_mov_b32 s33, s6
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_call_v16bf16:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s6, s33
-; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX10-NEXT:    v_mov_b32_e32 v4, v1
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v10, v2
-; GFX10-NEXT:    v_mov_b32_e32 v6, v3
-; GFX10-NEXT:    v_writelane_b32 v9, s30, 0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; GFX10-NEXT:    v_mov_b32_e32 v2, v4
-; GFX10-NEXT:    v_mov_b32_e32 v4, v10
-; GFX10-NEXT:    v_writelane_b32 v9, s31, 1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    buffer_store_dword v0, v8, s[0:3], 0 offen
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_dword v1, v8, s[0:3], 0 offen offset:4
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_dword v2, v8, s[0:3], 0 offen offset:8
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_dword v3, v8, s[0:3], 0 offen offset:12
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_dword v4, v8, s[0:3], 0 offen offset:16
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen offset:20
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v9, 0
-; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT:    buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
-; GFX10-NEXT:    s_mov_b32 s33, s6
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %result = call <16 x bfloat> @test_arg_store_v2bf16(<16 x bfloat> %in)
-  store volatile <16 x bfloat> %result, ptr addrspace(5) %out
-  ret void
-}
-
-define bfloat @test_alloca_load_store_ret(bfloat %in) {
-; GCN-LABEL: test_alloca_load_store_ret:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    buffer_store_short v0, off, s[0:3], s32
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_alloca_load_store_ret:
-; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], s32
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_alloca_load_store_ret:
-; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    buffer_store_short v0, off, s[0:3], s32
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_alloca_load_store_ret:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], s32
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_alloca_load_store_ret:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], s32
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %in.addr = alloca bfloat, align 2, addrspace(5)
-  store volatile bfloat %in, ptr addrspace(5) %in.addr, align 2
-  %loaded = load volatile bfloat, ptr addrspace(5) %in.addr, align 2
-  ret bfloat %loaded
-}
-
-define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
-; GCN-LABEL: test_overflow_stack:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
-; GCN-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
-; GCN-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 12, v0
-; GCN-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 16, v0
-; GCN-NEXT:    buffer_store_dword v6, v2, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 20, v0
-; GCN-NEXT:    buffer_store_dword v7, v2, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 24, v0
-; GCN-NEXT:    buffer_store_dword v8, v2, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 28, v0
-; GCN-NEXT:    buffer_store_dword v9, v2, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 32, v0
-; GCN-NEXT:    buffer_store_dword v10, v2, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 36, v0
-; GCN-NEXT:    buffer_store_dword v11, v2, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 40, v0
-; GCN-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 44, v0
-; GCN-NEXT:    buffer_store_dword v13, v2, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s32
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
-; GCN-NEXT:    buffer_store_dword v14, v3, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:4
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, 52, v0
-; GCN-NEXT:    buffer_store_dword v15, v4, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 56, v0
-; GCN-NEXT:    buffer_store_dword v16, v5, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 60, v0
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, 64, v0
-; GCN-NEXT:    buffer_store_dword v17, v5, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v5, 0x44
-; GCN-NEXT:    v_mov_b32_e32 v7, 0x48
-; GCN-NEXT:    buffer_store_dword v18, v6, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v6, 0x4c
-; GCN-NEXT:    v_mov_b32_e32 v8, 0x50
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
-; GCN-NEXT:    buffer_store_dword v19, v5, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v5, 0x54
-; GCN-NEXT:    v_mov_b32_e32 v9, 0x58
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v0, v7
-; GCN-NEXT:    buffer_store_dword v20, v7, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v7, 0x5c
-; GCN-NEXT:    v_mov_b32_e32 v10, 0x60
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v0, v6
-; GCN-NEXT:    buffer_store_dword v21, v6, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v6, 0x64
-; GCN-NEXT:    v_mov_b32_e32 v11, 0x68
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, v0, v8
-; GCN-NEXT:    buffer_store_dword v22, v8, s[0:3], 0 offen
-; GCN-NEXT:    v_mov_b32_e32 v8, 0x6c
-; GCN-NEXT:    v_add_i32_e32 v12, vcc, 0x70, v0
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
-; GCN-NEXT:    buffer_store_dword v23, v5, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 0x74, v0
-; GCN-NEXT:    v_add_i32_e32 v13, vcc, 0x78, v0
-; GCN-NEXT:    v_add_i32_e32 v9, vcc, v0, v9
-; GCN-NEXT:    buffer_store_dword v24, v9, s[0:3], 0 offen
-; GCN-NEXT:    v_add_i32_e32 v9, vcc, 0x7c, v0
-; GCN-NEXT:    v_add_i32_e32 v14, vcc, 0x80, v0
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v0, v7
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, v0, v10
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v0, v6
-; GCN-NEXT:    v_add_i32_e32 v11, vcc, v0, v11
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
-; GCN-NEXT:    buffer_store_dword v25, v7, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v26, v10, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v27, v6, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v28, v11, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v30, v12, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(14)
-; GCN-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v3, v13, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v4, v9, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_short v1, v14, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_overflow_stack:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
-; GFX7-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
-; GFX7-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 12, v0
-; GFX7-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 16, v0
-; GFX7-NEXT:    buffer_store_dword v6, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 20, v0
-; GFX7-NEXT:    buffer_store_dword v7, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 24, v0
-; GFX7-NEXT:    buffer_store_dword v8, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 28, v0
-; GFX7-NEXT:    buffer_store_dword v9, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 32, v0
-; GFX7-NEXT:    buffer_store_dword v10, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 36, v0
-; GFX7-NEXT:    buffer_store_dword v11, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 40, v0
-; GFX7-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 44, v0
-; GFX7-NEXT:    buffer_store_dword v13, v2, s[0:3], 0 offen
-; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s32
-; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
-; GFX7-NEXT:    buffer_store_dword v14, v3, s[0:3], 0 offen
-; GFX7-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:4
-; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 52, v0
-; GFX7-NEXT:    buffer_store_dword v15, v4, s[0:3], 0 offen
-; GFX7-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 56, v0
-; GFX7-NEXT:    buffer_store_dword v16, v5, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 60, v0
-; GFX7-NEXT:    buffer_store_dword v17, v5, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 64, v0
-; GFX7-NEXT:    buffer_store_dword v18, v5, s[0:3], 0 offen
-; GFX7-NEXT:    v_mov_b32_e32 v5, 0x44
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
-; GFX7-NEXT:    buffer_store_dword v19, v5, s[0:3], 0 offen
-; GFX7-NEXT:    v_mov_b32_e32 v5, 0x48
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
-; GFX7-NEXT:    buffer_store_dword v20, v5, s[0:3], 0 offen
-; GFX7-NEXT:    v_mov_b32_e32 v5, 0x4c
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
-; GFX7-NEXT:    buffer_store_dword v21, v5, s[0:3], 0 offen
-; GFX7-NEXT:    v_mov_b32_e32 v5, 0x50
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
-; GFX7-NEXT:    buffer_store_dword v22, v5, s[0:3], 0 offen
-; GFX7-NEXT:    v_mov_b32_e32 v5, 0x54
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
-; GFX7-NEXT:    buffer_store_dword v23, v5, s[0:3], 0 offen
-; GFX7-NEXT:    v_mov_b32_e32 v5, 0x58
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
-; GFX7-NEXT:    buffer_store_dword v24, v5, s[0:3], 0 offen
-; GFX7-NEXT:    v_mov_b32_e32 v5, 0x5c
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
-; GFX7-NEXT:    buffer_store_dword v25, v5, s[0:3], 0 offen
-; GFX7-NEXT:    v_mov_b32_e32 v5, 0x60
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
-; GFX7-NEXT:    buffer_store_dword v26, v5, s[0:3], 0 offen
-; GFX7-NEXT:    v_mov_b32_e32 v5, 0x64
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
-; GFX7-NEXT:    buffer_store_dword v27, v5, s[0:3], 0 offen
-; GFX7-NEXT:    v_mov_b32_e32 v5, 0x68
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
-; GFX7-NEXT:    buffer_store_dword v28, v5, s[0:3], 0 offen
-; GFX7-NEXT:    v_mov_b32_e32 v5, 0x6c
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
-; GFX7-NEXT:    buffer_store_dword v29, v5, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 0x70, v0
-; GFX7-NEXT:    buffer_store_dword v30, v5, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 0x74, v0
-; GFX7-NEXT:    s_waitcnt vmcnt(14)
-; GFX7-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
-; GFX7-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x7c, v0
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x80, v0
-; GFX7-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX7-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_overflow_stack:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
-; GFX8-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
-; GFX8-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 12, v0
-; GFX8-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 16, v0
-; GFX8-NEXT:    buffer_store_dword v6, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 20, v0
-; GFX8-NEXT:    buffer_store_dword v7, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 24, v0
-; GFX8-NEXT:    buffer_store_dword v8, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 28, v0
-; GFX8-NEXT:    buffer_store_dword v9, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 32, v0
-; GFX8-NEXT:    buffer_store_dword v10, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 36, v0
-; GFX8-NEXT:    buffer_store_dword v11, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 40, v0
-; GFX8-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 44, v0
-; GFX8-NEXT:    buffer_store_dword v13, v2, s[0:3], 0 offen
-; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s32
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 48, v0
-; GFX8-NEXT:    buffer_store_dword v14, v3, s[0:3], 0 offen
-; GFX8-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:4
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 52, v0
-; GFX8-NEXT:    buffer_store_dword v15, v4, s[0:3], 0 offen
-; GFX8-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 56, v0
-; GFX8-NEXT:    buffer_store_dword v16, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 60, v0
-; GFX8-NEXT:    buffer_store_dword v17, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 64, v0
-; GFX8-NEXT:    buffer_store_dword v18, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0x44
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v0, v5
-; GFX8-NEXT:    buffer_store_dword v19, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0x48
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v0, v5
-; GFX8-NEXT:    buffer_store_dword v20, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0x4c
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v0, v5
-; GFX8-NEXT:    buffer_store_dword v21, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0x50
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v0, v5
-; GFX8-NEXT:    buffer_store_dword v22, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0x54
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v0, v5
-; GFX8-NEXT:    buffer_store_dword v23, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0x58
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v0, v5
-; GFX8-NEXT:    buffer_store_dword v24, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0x5c
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v0, v5
-; GFX8-NEXT:    buffer_store_dword v25, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0x60
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v0, v5
-; GFX8-NEXT:    buffer_store_dword v26, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0x64
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v0, v5
-; GFX8-NEXT:    buffer_store_dword v27, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0x68
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v0, v5
-; GFX8-NEXT:    buffer_store_dword v28, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0x6c
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v0, v5
-; GFX8-NEXT:    buffer_store_dword v29, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x70, v0
-; GFX8-NEXT:    buffer_store_dword v30, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x74, v0
-; GFX8-NEXT:    s_waitcnt vmcnt(14)
-; GFX8-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x78, v0
-; GFX8-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7c, v0
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x80, v0
-; GFX8-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX8-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_overflow_stack:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX9-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
-; GFX9-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
-; GFX9-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
-; GFX9-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
-; GFX9-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
-; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
-; GFX9-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:32
-; GFX9-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:36
-; GFX9-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:40
-; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:44
-; GFX9-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:48
-; GFX9-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
-; GFX9-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
-; GFX9-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
-; GFX9-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
-; GFX9-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
-; GFX9-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
-; GFX9-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
-; GFX9-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
-; GFX9-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
-; GFX9-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
-; GFX9-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
-; GFX9-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
-; GFX9-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
-; GFX9-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:108
-; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:112
-; GFX9-NEXT:    s_waitcnt vmcnt(20)
-; GFX9-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:116
-; GFX9-NEXT:    s_waitcnt vmcnt(20)
-; GFX9-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:120
-; GFX9-NEXT:    s_waitcnt vmcnt(20)
-; GFX9-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:124
-; GFX9-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen offset:128
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_overflow_stack:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GFX10-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX10-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
-; GFX10-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
-; GFX10-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
-; GFX10-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
-; GFX10-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
-; GFX10-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
-; GFX10-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
-; GFX10-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:32
-; GFX10-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:36
-; GFX10-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:40
-; GFX10-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:44
-; GFX10-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:48
-; GFX10-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
-; GFX10-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
-; GFX10-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
-; GFX10-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
-; GFX10-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
-; GFX10-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
-; GFX10-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
-; GFX10-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
-; GFX10-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
-; GFX10-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
-; GFX10-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
-; GFX10-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
-; GFX10-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
-; GFX10-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
-; GFX10-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:108
-; GFX10-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:112
-; GFX10-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:116
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:120
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen offset:124
-; GFX10-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen offset:128
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0
-  %ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1
-  ret { <32 x i32>, bfloat } %ins.1
-}
-
-; FIXME: unable to translate instruction: fpext
-; define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) {
-;   %load = load <2 x bfloat>, ptr addrspace(1) %ptr
-;   %fpext = fpext <2 x bfloat> %load to <2 x float>
-;   ret <2 x float> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) {
-;   %load = load <3 x bfloat>, ptr addrspace(1) %ptr
-;   %fpext = fpext <3 x bfloat> %load to <3 x float>
-;   ret <3 x float> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <4 x float> @global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) {
-;   %load = load <4 x bfloat>, ptr addrspace(1) %ptr
-;   %fpext = fpext <4 x bfloat> %load to <4 x float>
-;   ret <4 x float> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) {
-;   %load = load <5 x bfloat>, ptr addrspace(1) %ptr
-;   %fpext = fpext <5 x bfloat> %load to <5 x float>
-;   ret <5 x float> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) {
-;   %load = load <6 x bfloat>, ptr addrspace(1) %ptr
-;   %fpext = fpext <6 x bfloat> %load to <6 x float>
-;   ret <6 x float> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) {
-;   %load = load <8 x bfloat>, ptr addrspace(1) %ptr
-;   %fpext = fpext <8 x bfloat> %load to <8 x float>
-;   ret <8 x float> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) {
-;   %load = load <16 x bfloat>, ptr addrspace(1) %ptr
-;   %fpext = fpext <16 x bfloat> %load to <16 x float>
-;   ret <16 x float> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) {
-;   %load = load <32 x bfloat>, ptr addrspace(1) %ptr
-;   %fpext = fpext <32 x bfloat> %load to <32 x float>
-;   ret <32 x float> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) {
-;   %load = load <2 x bfloat>, ptr addrspace(1) %ptr
-;   %fpext = fpext <2 x bfloat> %load to <2 x double>
-;   ret <2 x double> %fpext
-; }
-
-; define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) {
-;   %load = load <3 x bfloat>, ptr addrspace(1) %ptr
-;   %fpext = fpext <3 x bfloat> %load to <3 x double>
-;   ret <3 x double> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) {
-;   %load = load <4 x bfloat>, ptr addrspace(1) %ptr
-;   %fpext = fpext <4 x bfloat> %load to <4 x double>
-;   ret <4 x double> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) {
-;   %load = load <5 x bfloat>, ptr addrspace(1) %ptr
-;   %fpext = fpext <5 x bfloat> %load to <5 x double>
-;   ret <5 x double> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) {
-;   %load = load <6 x bfloat>, ptr addrspace(1) %ptr
-;   %fpext = fpext <6 x bfloat> %load to <6 x double>
-;   ret <6 x double> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) {
-;   %load = load <8 x bfloat>, ptr addrspace(1) %ptr
-;   %fpext = fpext <8 x bfloat> %load to <8 x double>
-;   ret <8 x double> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) {
-;   %load = load <16 x bfloat>, ptr addrspace(1) %ptr
-;   %fpext = fpext <16 x bfloat> %load to <16 x double>
-;   ret <16 x double> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
-;   %load = load <32 x bfloat>, ptr addrspace(1) %ptr
-;   %fpext = fpext <32 x bfloat> %load to <32 x double>
-;   ret <32 x double> %fpext
-; }
-
-define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fadd_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fadd_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fadd_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fadd_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fadd_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fadd bfloat %a, %b
-  ret bfloat %op
-}
-
-define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
-; GCN-LABEL: v_fadd_v2bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_add_f32_e32 v0, v0, v2
-; GCN-NEXT:    v_add_f32_e32 v1, v1, v3
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fadd_v2bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fadd_v2bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_f16_e32 v2, v0, v1
-; GFX8-NEXT:    v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fadd_v2bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_add_f16 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fadd_v2bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_pk_add_f16 v0, v0, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fadd <2 x bfloat> %a, %b
-  ret <2 x bfloat> %op
-}
-
-define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
-; GCN-LABEL: v_fadd_v3bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    v_add_f32_e32 v0, v0, v3
-; GCN-NEXT:    v_add_f32_e32 v1, v1, v4
-; GCN-NEXT:    v_add_f32_e32 v2, v2, v5
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fadd_v3bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_add_f32_e32 v0, v0, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_add_f32_e32 v2, v2, v4
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fadd_v3bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_f16_e32 v3, v0, v2
-; GFX8-NEXT:    v_add_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_mov_b32_e32 v0, v3
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fadd_v3bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
-; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v0
-; GFX9-NEXT:    v_bfi_b32 v1, s4, v2, v2
-; GFX9-NEXT:    v_pk_add_f16 v0, v0, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fadd_v3bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
-; GFX10-NEXT:    v_bfi_b32 v1, 0xffff, v2, v2
-; GFX10-NEXT:    v_pk_add_f16 v0, v0, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fadd <3 x bfloat> %a, %b
-  ret <3 x bfloat> %op
-}
-
-define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
-; GCN-LABEL: v_fadd_v4bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT:    v_add_f32_e32 v0, v0, v4
-; GCN-NEXT:    v_add_f32_e32 v1, v1, v5
-; GCN-NEXT:    v_add_f32_e32 v2, v2, v6
-; GCN-NEXT:    v_add_f32_e32 v3, v3, v7
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fadd_v4bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v6
-; GFX7-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v7
-; GFX7-NEXT:    v_add_f32_e32 v2, v2, v4
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fadd_v4bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_f16_e32 v3, v0, v2
-; GFX8-NEXT:    v_add_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_mov_b32_e32 v0, v3
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fadd_v4bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_add_f16 v0, v0, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fadd_v4bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_pk_add_f16 v0, v0, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fadd <4 x bfloat> %a, %b
-  ret <4 x bfloat> %op
-}
-
-define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
-; GCN-LABEL: v_fadd_v8bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT:    v_add_f32_e32 v0, v0, v8
-; GCN-NEXT:    v_add_f32_e32 v1, v1, v9
-; GCN-NEXT:    v_add_f32_e32 v2, v2, v10
-; GCN-NEXT:    v_add_f32_e32 v3, v3, v11
-; GCN-NEXT:    v_add_f32_e32 v4, v4, v12
-; GCN-NEXT:    v_add_f32_e32 v5, v5, v13
-; GCN-NEXT:    v_add_f32_e32 v6, v6, v14
-; GCN-NEXT:    v_add_f32_e32 v7, v7, v15
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fadd_v8bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_add_f32_e32 v0, v0, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v10
-; GFX7-NEXT:    v_add_f32_e32 v1, v1, v9
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v11
-; GFX7-NEXT:    v_add_f32_e32 v2, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v12
-; GFX7-NEXT:    v_add_f32_e32 v3, v3, v9
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v13
-; GFX7-NEXT:    v_add_f32_e32 v4, v4, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v14
-; GFX7-NEXT:    v_add_f32_e32 v5, v5, v9
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v15
-; GFX7-NEXT:    v_add_f32_e32 v6, v6, v8
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_add_f32_e32 v7, v7, v9
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fadd_v8bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_f16_e32 v6, v0, v4
-; GFX8-NEXT:    v_add_f16_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_add_f16_e32 v2, v1, v5
-; GFX8-NEXT:    v_add_f16_sdwa v3, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_mov_b32_e32 v0, v6
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fadd_v8bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_add_f16 v0, v0, v4
-; GFX9-NEXT:    v_pk_add_f16 v2, v1, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fadd_v8bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_pk_add_f16 v0, v0, v4
-; GFX10-NEXT:    v_pk_add_f16 v2, v1, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fadd <8 x bfloat> %a, %b
-  ret <8 x bfloat> %op
-}
-
-define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
-; GCN-LABEL: v_fadd_v16bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT:    v_add_f32_e32 v0, v0, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v17
-; GCN-NEXT:    v_add_f32_e32 v1, v1, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v18
-; GCN-NEXT:    v_add_f32_e32 v2, v2, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v19
-; GCN-NEXT:    v_add_f32_e32 v3, v3, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v20
-; GCN-NEXT:    v_add_f32_e32 v4, v4, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v21
-; GCN-NEXT:    v_add_f32_e32 v5, v5, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v22
-; GCN-NEXT:    v_add_f32_e32 v6, v6, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v23
-; GCN-NEXT:    v_add_f32_e32 v7, v7, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v24
-; GCN-NEXT:    v_add_f32_e32 v8, v8, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v25
-; GCN-NEXT:    v_add_f32_e32 v9, v9, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v26
-; GCN-NEXT:    v_add_f32_e32 v10, v10, v16
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT:    v_cvt_f32_f16_e32 v17, v27
-; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT:    v_cvt_f32_f16_e32 v18, v28
-; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT:    v_cvt_f32_f16_e32 v19, v29
-; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT:    v_cvt_f32_f16_e32 v20, v30
-; GCN-NEXT:    v_add_f32_e32 v11, v11, v17
-; GCN-NEXT:    v_add_f32_e32 v12, v12, v18
-; GCN-NEXT:    v_add_f32_e32 v13, v13, v19
-; GCN-NEXT:    v_add_f32_e32 v14, v14, v20
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; GCN-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; GCN-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT:    v_add_f32_e32 v15, v15, v16
-; GCN-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fadd_v16bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_add_f32_e32 v0, v0, v16
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v20
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_add_f32_e32 v1, v1, v16
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v18
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v21
-; GFX7-NEXT:    v_add_f32_e32 v4, v4, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT:    v_add_f32_e32 v2, v2, v16
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v19
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v22
-; GFX7-NEXT:    v_add_f32_e32 v5, v5, v18
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GFX7-NEXT:    v_add_f32_e32 v3, v3, v16
-; GFX7-NEXT:    buffer_load_dword v16, off, s[0:3], s32
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v23
-; GFX7-NEXT:    v_add_f32_e32 v6, v6, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v24
-; GFX7-NEXT:    v_add_f32_e32 v7, v7, v18
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v25
-; GFX7-NEXT:    v_add_f32_e32 v8, v8, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v26
-; GFX7-NEXT:    v_add_f32_e32 v9, v9, v18
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v27
-; GFX7-NEXT:    v_add_f32_e32 v10, v10, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v28
-; GFX7-NEXT:    v_add_f32_e32 v11, v11, v18
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v29
-; GFX7-NEXT:    v_add_f32_e32 v12, v12, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v30
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; GFX7-NEXT:    v_add_f32_e32 v13, v13, v18
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_add_f32_e32 v14, v14, v17
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT:    v_add_f32_e32 v15, v15, v16
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fadd_v16bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_f16_e32 v12, v0, v8
-; GFX8-NEXT:    v_add_f16_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_add_f16_e32 v13, v1, v9
-; GFX8-NEXT:    v_add_f16_sdwa v9, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_add_f16_e32 v4, v2, v10
-; GFX8-NEXT:    v_add_f16_sdwa v5, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_add_f16_e32 v6, v3, v11
-; GFX8-NEXT:    v_add_f16_sdwa v7, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_mov_b32_e32 v0, v12
-; GFX8-NEXT:    v_mov_b32_e32 v1, v8
-; GFX8-NEXT:    v_mov_b32_e32 v2, v13
-; GFX8-NEXT:    v_mov_b32_e32 v3, v9
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fadd_v16bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
-; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_add_f16 v0, v0, v8
-; GFX9-NEXT:    v_pk_add_f16 v8, v1, v9
-; GFX9-NEXT:    v_pk_add_f16 v4, v2, v10
-; GFX9-NEXT:    v_pk_add_f16 v6, v3, v11
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; GFX9-NEXT:    v_mov_b32_e32 v2, v8
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fadd_v16bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
-; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v10
-; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_pk_add_f16 v0, v0, v8
-; GFX10-NEXT:    v_pk_add_f16 v8, v1, v9
-; GFX10-NEXT:    v_pk_add_f16 v4, v2, v10
-; GFX10-NEXT:    v_pk_add_f16 v6, v3, v11
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; GFX10-NEXT:    v_mov_b32_e32 v2, v8
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fadd <16 x bfloat> %a, %b
-  ret <16 x bfloat> %op
-}
-
-define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
-; GCN-LABEL: v_fadd_v32bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    v_add_f32_e32 v0, v0, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
-; GCN-NEXT:    v_add_f32_e32 v1, v1, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:16
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_add_f32_e32 v2, v2, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
-; GCN-NEXT:    v_add_f32_e32 v3, v3, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_add_f32_e32 v4, v4, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
-; GCN-NEXT:    v_add_f32_e32 v5, v5, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:32
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_add_f32_e32 v6, v6, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
-; GCN-NEXT:    v_add_f32_e32 v7, v7, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:40
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_add_f32_e32 v8, v8, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
-; GCN-NEXT:    v_add_f32_e32 v9, v9, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:48
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_add_f32_e32 v10, v10, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
-; GCN-NEXT:    v_add_f32_e32 v11, v11, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:56
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_add_f32_e32 v12, v12, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
-; GCN-NEXT:    v_add_f32_e32 v13, v13, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:64
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_add_f32_e32 v14, v14, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
-; GCN-NEXT:    v_add_f32_e32 v15, v15, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_add_f32_e32 v16, v16, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
-; GCN-NEXT:    v_add_f32_e32 v17, v17, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_add_f32_e32 v18, v18, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
-; GCN-NEXT:    v_add_f32_e32 v19, v19, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:88
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_add_f32_e32 v20, v20, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
-; GCN-NEXT:    v_add_f32_e32 v21, v21, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v22, v22
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:96
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_add_f32_e32 v22, v22, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100
-; GCN-NEXT:    v_add_f32_e32 v23, v23, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v24, v24
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:104
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_add_f32_e32 v24, v24, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v25, v25
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
-; GCN-NEXT:    v_add_f32_e32 v25, v25, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v26, v26
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:112
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_add_f32_e32 v26, v26, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v27, v27
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
-; GCN-NEXT:    v_add_f32_e32 v27, v27, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v28, v28
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_add_f32_e32 v28, v28, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v29, v29
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    v_add_f32_e32 v29, v29, v31
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:124
-; GCN-NEXT:    v_cvt_f32_f16_e32 v30, v30
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32
-; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
-; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    v_add_f32_e32 v30, v30, v31
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v32
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v33
-; GCN-NEXT:    v_add_f32_e32 v31, v31, v32
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; GCN-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; GCN-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; GCN-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v20
-; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v21
-; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; GCN-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; GCN-NEXT:    v_cvt_f16_f32_e32 v24, v24
-; GCN-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; GCN-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; GCN-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v28
-; GCN-NEXT:    v_cvt_f16_f32_e32 v29, v29
-; GCN-NEXT:    v_cvt_f16_f32_e32 v30, v30
-; GCN-NEXT:    v_cvt_f16_f32_e32 v31, v31
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fadd_v32bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v22, v22
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v24, v24
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v25, v25
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v26, v26
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v27, v27
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v28, v28
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v29, v29
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v30, v30
-; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v0, v0, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v1, v1, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:12
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v2, v2, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:16
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v3, v3, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:20
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v4, v4, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v5, v5, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v6, v6, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:32
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v7, v7, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:36
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v8, v8, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:40
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v9, v9, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:44
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v10, v10, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:48
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v11, v11, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:52
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v12, v12, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:56
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v13, v13, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:60
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v14, v14, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v15, v15, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v16, v16, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v17, v17, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:76
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v18, v18, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v19, v19, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:84
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v20, v20, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:88
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v20, v20
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v21, v21, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v21, v21
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v22, v22, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:96
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v23, v23, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:100
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v24, v24, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:104
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v24, v24
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v25, v25, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:108
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v26, v26, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:112
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v27, v27, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v28, v28, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v28, v28
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v29, v29, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:124
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v29, v29
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v30, v30, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v30, v30
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_add_f32_e32 v31, v31, v32
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v31, v31
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fadd_v32bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_f16_e32 v24, v0, v16
-; GFX8-NEXT:    v_add_f16_sdwa v16, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_add_f16_e32 v25, v1, v17
-; GFX8-NEXT:    v_add_f16_sdwa v17, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_add_f16_e32 v26, v2, v18
-; GFX8-NEXT:    v_add_f16_sdwa v18, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_add_f16_e32 v27, v3, v19
-; GFX8-NEXT:    v_add_f16_sdwa v19, v3, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_add_f16_e32 v8, v4, v20
-; GFX8-NEXT:    v_add_f16_sdwa v9, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_add_f16_e32 v10, v5, v21
-; GFX8-NEXT:    v_add_f16_sdwa v11, v5, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_add_f16_e32 v12, v6, v22
-; GFX8-NEXT:    v_add_f16_sdwa v13, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_add_f16_e32 v14, v7, v23
-; GFX8-NEXT:    v_add_f16_sdwa v15, v7, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_mov_b32_e32 v0, v24
-; GFX8-NEXT:    v_mov_b32_e32 v1, v16
-; GFX8-NEXT:    v_mov_b32_e32 v2, v25
-; GFX8-NEXT:    v_mov_b32_e32 v3, v17
-; GFX8-NEXT:    v_mov_b32_e32 v4, v26
-; GFX8-NEXT:    v_mov_b32_e32 v5, v18
-; GFX8-NEXT:    v_mov_b32_e32 v6, v27
-; GFX8-NEXT:    v_mov_b32_e32 v7, v19
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fadd_v32bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v24, 16, v16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 16, v17
-; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
-; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 16, v19
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v20
-; GFX9-NEXT:    v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v21
-; GFX9-NEXT:    v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v22
-; GFX9-NEXT:    v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v23
-; GFX9-NEXT:    v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v20, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v21, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v22, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v23, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_add_f16 v0, v0, v16
-; GFX9-NEXT:    v_pk_add_f16 v16, v1, v17
-; GFX9-NEXT:    v_pk_add_f16 v18, v2, v18
-; GFX9-NEXT:    v_pk_add_f16 v17, v3, v19
-; GFX9-NEXT:    v_pk_add_f16 v8, v4, v20
-; GFX9-NEXT:    v_pk_add_f16 v10, v5, v21
-; GFX9-NEXT:    v_pk_add_f16 v12, v6, v22
-; GFX9-NEXT:    v_pk_add_f16 v14, v7, v23
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v18
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v17
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; GFX9-NEXT:    v_mov_b32_e32 v2, v16
-; GFX9-NEXT:    v_mov_b32_e32 v4, v18
-; GFX9-NEXT:    v_mov_b32_e32 v6, v17
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fadd_v32bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
-; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v16
-; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v17
-; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
-; GFX10-NEXT:    v_lshrrev_b32_e32 v27, 16, v19
-; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v20
-; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v21
-; GFX10-NEXT:    v_lshrrev_b32_e32 v30, 16, v22
-; GFX10-NEXT:    v_lshrrev_b32_e32 v31, 16, v23
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v20, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v21, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v22, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v23, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_pk_add_f16 v0, v0, v16
-; GFX10-NEXT:    v_pk_add_f16 v16, v1, v17
-; GFX10-NEXT:    v_pk_add_f16 v18, v2, v18
-; GFX10-NEXT:    v_pk_add_f16 v17, v3, v19
-; GFX10-NEXT:    v_pk_add_f16 v8, v4, v20
-; GFX10-NEXT:    v_pk_add_f16 v10, v5, v21
-; GFX10-NEXT:    v_pk_add_f16 v12, v6, v22
-; GFX10-NEXT:    v_pk_add_f16 v14, v7, v23
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v16
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v18
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v17
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
-; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; GFX10-NEXT:    v_mov_b32_e32 v2, v16
-; GFX10-NEXT:    v_mov_b32_e32 v4, v18
-; GFX10-NEXT:    v_mov_b32_e32 v6, v17
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fadd <32 x bfloat> %a, %b
-  ret <32 x bfloat> %op
-}
-
-define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
-; GCN-LABEL: v_fadd_bf16_fpimm_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, 0x3f80
-; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fadd_bf16_fpimm_0:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, 0x3f80
-; GFX7-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fadd_bf16_fpimm_0:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_f16_e32 v0, 0x3f80, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fadd_bf16_fpimm_0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_f16_e32 v0, 0x3f80, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fadd_bf16_fpimm_0:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_add_f16_e32 v0, 0x3f80, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %add = fadd bfloat %arg0, 1.0
-  ret bfloat %add
-}
-
-define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
-; GCN-LABEL: v_fadd_bf16_fpimm_1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, 0x4228
-; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fadd_bf16_fpimm_1:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, 0x4228
-; GFX7-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fadd_bf16_fpimm_1:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_f16_e32 v0, 0x4228, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fadd_bf16_fpimm_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_f16_e32 v0, 0x4228, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fadd_bf16_fpimm_1:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_add_f16_e32 v0, 0x4228, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %add = fadd bfloat %arg0, 42.0
-  ret bfloat %add
-}
-
-define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fsub_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e64 v1, -v1
-; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fsub_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e64 v1, -v1
-; GFX7-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fsub_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_sub_f16_e32 v0, v0, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fsub_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_sub_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fsub_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_sub_f16_e32 v0, v0, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fsub bfloat %a, %b
-  ret bfloat %op
-}
-
-define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
-; GCN-LABEL: v_fsub_v2bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_or_b32_e32 v2, v3, v2
-; GCN-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_add_f32_e32 v0, v0, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v3
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_add_f32_e32 v1, v1, v2
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fsub_v2bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fsub_v2bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
-; GFX8-NEXT:    v_add_f16_e32 v2, v0, v1
-; GFX8-NEXT:    v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fsub_v2bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fsub_v2bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fsub <2 x bfloat> %a, %b
-  ret <2 x bfloat> %op
-}
-
-define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
-; GCN-LABEL: v_fsub_v3bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e64 v3, -v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e64 v4, -v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e64 v5, -v5
-; GCN-NEXT:    v_add_f32_e32 v0, v0, v3
-; GCN-NEXT:    v_add_f32_e32 v1, v1, v4
-; GCN-NEXT:    v_add_f32_e32 v2, v2, v5
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fsub_v3bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e64 v3, -v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_add_f32_e32 v0, v0, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e64 v3, -v4
-; GFX7-NEXT:    v_cvt_f32_f16_e64 v4, -v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_add_f32_e32 v2, v2, v4
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fsub_v3bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_sub_f16_e32 v3, v0, v2
-; GFX8-NEXT:    v_sub_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_mov_b32_e32 v0, v3
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fsub_v3bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_sub_f16_e32 v3, v0, v2
-; GFX9-NEXT:    v_sub_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_mov_b32_e32 v0, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fsub_v3bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_sub_f16_e32 v3, v0, v2
-; GFX10-NEXT:    v_sub_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_mov_b32_e32 v0, v3
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fsub <3 x bfloat> %a, %b
-  ret <3 x bfloat> %op
-}
-
-define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
-; GCN-LABEL: v_fsub_v4bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e64 v4, -v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e64 v5, -v5
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e64 v6, -v6
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e64 v7, -v7
-; GCN-NEXT:    v_add_f32_e32 v0, v0, v4
-; GCN-NEXT:    v_add_f32_e32 v1, v1, v5
-; GCN-NEXT:    v_add_f32_e32 v2, v2, v6
-; GCN-NEXT:    v_add_f32_e32 v3, v3, v7
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fsub_v4bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e64 v4, -v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e64 v5, -v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e64 v4, -v6
-; GFX7-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e64 v5, -v7
-; GFX7-NEXT:    v_add_f32_e32 v2, v2, v4
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fsub_v4bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_sub_f16_e32 v3, v0, v2
-; GFX8-NEXT:    v_sub_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_mov_b32_e32 v0, v3
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fsub_v4bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_sub_f16_e32 v3, v0, v2
-; GFX9-NEXT:    v_sub_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_mov_b32_e32 v0, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fsub_v4bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_sub_f16_e32 v3, v0, v2
-; GFX10-NEXT:    v_sub_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_mov_b32_e32 v0, v3
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fsub <4 x bfloat> %a, %b
-  ret <4 x bfloat> %op
-}
-
-define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fmul_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fmul_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fmul_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fmul_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fmul_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fmul bfloat %a, %b
-  ret bfloat %op
-}
-
-define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
-; GCN-LABEL: v_fmul_v2bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GCN-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fmul_v2bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fmul_v2bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mul_f16_e32 v2, v0, v1
-; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fmul_v2bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fmul_v2bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fmul <2 x bfloat> %a, %b
-  ret <2 x bfloat> %op
-}
-
-define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
-; GCN-LABEL: v_fmul_v3bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    v_mul_f32_e32 v0, v0, v3
-; GCN-NEXT:    v_mul_f32_e32 v1, v1, v4
-; GCN-NEXT:    v_mul_f32_e32 v2, v2, v5
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fmul_v3bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v4
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fmul_v3bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mul_f16_e32 v3, v0, v2
-; GFX8-NEXT:    v_mul_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_mov_b32_e32 v0, v3
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fmul_v3bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
-; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v0
-; GFX9-NEXT:    v_bfi_b32 v1, s4, v2, v2
-; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fmul_v3bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
-; GFX10-NEXT:    v_bfi_b32 v1, 0xffff, v2, v2
-; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fmul <3 x bfloat> %a, %b
-  ret <3 x bfloat> %op
-}
-
-define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
-; GCN-LABEL: v_fmul_v4bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT:    v_mul_f32_e32 v0, v0, v4
-; GCN-NEXT:    v_mul_f32_e32 v1, v1, v5
-; GCN-NEXT:    v_mul_f32_e32 v2, v2, v6
-; GCN-NEXT:    v_mul_f32_e32 v3, v3, v7
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fmul_v4bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v6
-; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v7
-; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v4
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_mul_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fmul_v4bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mul_f16_e32 v3, v0, v2
-; GFX8-NEXT:    v_mul_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_mov_b32_e32 v0, v3
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fmul_v4bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fmul_v4bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fmul <4 x bfloat> %a, %b
-  ret <4 x bfloat> %op
-}
-
-define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
-; GCN-LABEL: v_fmul_v8bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT:    v_mul_f32_e32 v0, v0, v8
-; GCN-NEXT:    v_mul_f32_e32 v1, v1, v9
-; GCN-NEXT:    v_mul_f32_e32 v2, v2, v10
-; GCN-NEXT:    v_mul_f32_e32 v3, v3, v11
-; GCN-NEXT:    v_mul_f32_e32 v4, v4, v12
-; GCN-NEXT:    v_mul_f32_e32 v5, v5, v13
-; GCN-NEXT:    v_mul_f32_e32 v6, v6, v14
-; GCN-NEXT:    v_mul_f32_e32 v7, v7, v15
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fmul_v8bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v10
-; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v9
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v11
-; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v12
-; GFX7-NEXT:    v_mul_f32_e32 v3, v3, v9
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v13
-; GFX7-NEXT:    v_mul_f32_e32 v4, v4, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v14
-; GFX7-NEXT:    v_mul_f32_e32 v5, v5, v9
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v15
-; GFX7-NEXT:    v_mul_f32_e32 v6, v6, v8
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_mul_f32_e32 v7, v7, v9
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fmul_v8bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mul_f16_e32 v6, v0, v4
-; GFX8-NEXT:    v_mul_f16_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_mul_f16_e32 v2, v1, v5
-; GFX8-NEXT:    v_mul_f16_sdwa v3, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_mov_b32_e32 v0, v6
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fmul_v8bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v4
-; GFX9-NEXT:    v_pk_mul_f16 v2, v1, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fmul_v8bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v4
-; GFX10-NEXT:    v_pk_mul_f16 v2, v1, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fmul <8 x bfloat> %a, %b
-  ret <8 x bfloat> %op
-}
-
-define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
-; GCN-LABEL: v_fmul_v16bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT:    v_mul_f32_e32 v0, v0, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v17
-; GCN-NEXT:    v_mul_f32_e32 v1, v1, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v18
-; GCN-NEXT:    v_mul_f32_e32 v2, v2, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v19
-; GCN-NEXT:    v_mul_f32_e32 v3, v3, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v20
-; GCN-NEXT:    v_mul_f32_e32 v4, v4, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v21
-; GCN-NEXT:    v_mul_f32_e32 v5, v5, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v22
-; GCN-NEXT:    v_mul_f32_e32 v6, v6, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v23
-; GCN-NEXT:    v_mul_f32_e32 v7, v7, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v24
-; GCN-NEXT:    v_mul_f32_e32 v8, v8, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v25
-; GCN-NEXT:    v_mul_f32_e32 v9, v9, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v26
-; GCN-NEXT:    v_mul_f32_e32 v10, v10, v16
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT:    v_cvt_f32_f16_e32 v17, v27
-; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT:    v_cvt_f32_f16_e32 v18, v28
-; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT:    v_cvt_f32_f16_e32 v19, v29
-; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT:    v_cvt_f32_f16_e32 v20, v30
-; GCN-NEXT:    v_mul_f32_e32 v11, v11, v17
-; GCN-NEXT:    v_mul_f32_e32 v12, v12, v18
-; GCN-NEXT:    v_mul_f32_e32 v13, v13, v19
-; GCN-NEXT:    v_mul_f32_e32 v14, v14, v20
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; GCN-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; GCN-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT:    v_mul_f32_e32 v15, v15, v16
-; GCN-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fmul_v16bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v16
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v20
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v16
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v18
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v21
-; GFX7-NEXT:    v_mul_f32_e32 v4, v4, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v16
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v19
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v22
-; GFX7-NEXT:    v_mul_f32_e32 v5, v5, v18
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GFX7-NEXT:    v_mul_f32_e32 v3, v3, v16
-; GFX7-NEXT:    buffer_load_dword v16, off, s[0:3], s32
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v23
-; GFX7-NEXT:    v_mul_f32_e32 v6, v6, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v24
-; GFX7-NEXT:    v_mul_f32_e32 v7, v7, v18
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v25
-; GFX7-NEXT:    v_mul_f32_e32 v8, v8, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v26
-; GFX7-NEXT:    v_mul_f32_e32 v9, v9, v18
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v27
-; GFX7-NEXT:    v_mul_f32_e32 v10, v10, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v28
-; GFX7-NEXT:    v_mul_f32_e32 v11, v11, v18
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v29
-; GFX7-NEXT:    v_mul_f32_e32 v12, v12, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v30
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; GFX7-NEXT:    v_mul_f32_e32 v13, v13, v18
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_mul_f32_e32 v14, v14, v17
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT:    v_mul_f32_e32 v15, v15, v16
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fmul_v16bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mul_f16_e32 v12, v0, v8
-; GFX8-NEXT:    v_mul_f16_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_mul_f16_e32 v13, v1, v9
-; GFX8-NEXT:    v_mul_f16_sdwa v9, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_mul_f16_e32 v4, v2, v10
-; GFX8-NEXT:    v_mul_f16_sdwa v5, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_mul_f16_e32 v6, v3, v11
-; GFX8-NEXT:    v_mul_f16_sdwa v7, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_mov_b32_e32 v0, v12
-; GFX8-NEXT:    v_mov_b32_e32 v1, v8
-; GFX8-NEXT:    v_mov_b32_e32 v2, v13
-; GFX8-NEXT:    v_mov_b32_e32 v3, v9
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fmul_v16bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
-; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v8
-; GFX9-NEXT:    v_pk_mul_f16 v8, v1, v9
-; GFX9-NEXT:    v_pk_mul_f16 v4, v2, v10
-; GFX9-NEXT:    v_pk_mul_f16 v6, v3, v11
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; GFX9-NEXT:    v_mov_b32_e32 v2, v8
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fmul_v16bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
-; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v10
-; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v8
-; GFX10-NEXT:    v_pk_mul_f16 v8, v1, v9
-; GFX10-NEXT:    v_pk_mul_f16 v4, v2, v10
-; GFX10-NEXT:    v_pk_mul_f16 v6, v3, v11
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; GFX10-NEXT:    v_mov_b32_e32 v2, v8
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fmul <16 x bfloat> %a, %b
-  ret <16 x bfloat> %op
-}
-
-define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
-; GCN-LABEL: v_fmul_v32bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    v_mul_f32_e32 v0, v0, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
-; GCN-NEXT:    v_mul_f32_e32 v1, v1, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:16
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_mul_f32_e32 v2, v2, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
-; GCN-NEXT:    v_mul_f32_e32 v3, v3, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_mul_f32_e32 v4, v4, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
-; GCN-NEXT:    v_mul_f32_e32 v5, v5, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:32
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_mul_f32_e32 v6, v6, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
-; GCN-NEXT:    v_mul_f32_e32 v7, v7, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:40
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_mul_f32_e32 v8, v8, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
-; GCN-NEXT:    v_mul_f32_e32 v9, v9, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:48
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_mul_f32_e32 v10, v10, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
-; GCN-NEXT:    v_mul_f32_e32 v11, v11, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:56
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_mul_f32_e32 v12, v12, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
-; GCN-NEXT:    v_mul_f32_e32 v13, v13, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:64
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_mul_f32_e32 v14, v14, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
-; GCN-NEXT:    v_mul_f32_e32 v15, v15, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_mul_f32_e32 v16, v16, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
-; GCN-NEXT:    v_mul_f32_e32 v17, v17, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_mul_f32_e32 v18, v18, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
-; GCN-NEXT:    v_mul_f32_e32 v19, v19, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:88
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_mul_f32_e32 v20, v20, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
-; GCN-NEXT:    v_mul_f32_e32 v21, v21, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v22, v22
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:96
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_mul_f32_e32 v22, v22, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100
-; GCN-NEXT:    v_mul_f32_e32 v23, v23, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v24, v24
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:104
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_mul_f32_e32 v24, v24, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v25, v25
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
-; GCN-NEXT:    v_mul_f32_e32 v25, v25, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v26, v26
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:112
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_mul_f32_e32 v26, v26, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v27, v27
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
-; GCN-NEXT:    v_mul_f32_e32 v27, v27, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v28, v28
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_mul_f32_e32 v28, v28, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v29, v29
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    v_mul_f32_e32 v29, v29, v31
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:124
-; GCN-NEXT:    v_cvt_f32_f16_e32 v30, v30
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32
-; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
-; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    v_mul_f32_e32 v30, v30, v31
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v32
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v33
-; GCN-NEXT:    v_mul_f32_e32 v31, v31, v32
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; GCN-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; GCN-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; GCN-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v20
-; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v21
-; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; GCN-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; GCN-NEXT:    v_cvt_f16_f32_e32 v24, v24
-; GCN-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; GCN-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; GCN-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v28
-; GCN-NEXT:    v_cvt_f16_f32_e32 v29, v29
-; GCN-NEXT:    v_cvt_f16_f32_e32 v30, v30
-; GCN-NEXT:    v_cvt_f16_f32_e32 v31, v31
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fmul_v32bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v22, v22
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v24, v24
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v25, v25
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v26, v26
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v27, v27
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v28, v28
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v29, v29
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v30, v30
-; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:12
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:16
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v3, v3, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:20
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v4, v4, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v5, v5, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v6, v6, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:32
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v7, v7, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:36
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v8, v8, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:40
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v9, v9, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:44
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v10, v10, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:48
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v11, v11, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:52
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v12, v12, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:56
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v13, v13, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:60
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v14, v14, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v15, v15, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v16, v16, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v17, v17, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:76
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v18, v18, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v19, v19, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:84
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v20, v20, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:88
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v20, v20
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v21, v21, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v21, v21
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v22, v22, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:96
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v23, v23, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:100
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v24, v24, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:104
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v24, v24
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v25, v25, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:108
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v26, v26, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:112
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v27, v27, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v28, v28, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v28, v28
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v29, v29, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:124
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v29, v29
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v30, v30, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v30, v30
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_mul_f32_e32 v31, v31, v32
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v31, v31
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fmul_v32bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mul_f16_e32 v24, v0, v16
-; GFX8-NEXT:    v_mul_f16_sdwa v16, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_mul_f16_e32 v25, v1, v17
-; GFX8-NEXT:    v_mul_f16_sdwa v17, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_mul_f16_e32 v26, v2, v18
-; GFX8-NEXT:    v_mul_f16_sdwa v18, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_mul_f16_e32 v27, v3, v19
-; GFX8-NEXT:    v_mul_f16_sdwa v19, v3, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_mul_f16_e32 v8, v4, v20
-; GFX8-NEXT:    v_mul_f16_sdwa v9, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_mul_f16_e32 v10, v5, v21
-; GFX8-NEXT:    v_mul_f16_sdwa v11, v5, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_mul_f16_e32 v12, v6, v22
-; GFX8-NEXT:    v_mul_f16_sdwa v13, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_mul_f16_e32 v14, v7, v23
-; GFX8-NEXT:    v_mul_f16_sdwa v15, v7, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_mov_b32_e32 v0, v24
-; GFX8-NEXT:    v_mov_b32_e32 v1, v16
-; GFX8-NEXT:    v_mov_b32_e32 v2, v25
-; GFX8-NEXT:    v_mov_b32_e32 v3, v17
-; GFX8-NEXT:    v_mov_b32_e32 v4, v26
-; GFX8-NEXT:    v_mov_b32_e32 v5, v18
-; GFX8-NEXT:    v_mov_b32_e32 v6, v27
-; GFX8-NEXT:    v_mov_b32_e32 v7, v19
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fmul_v32bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v24, 16, v16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 16, v17
-; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
-; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 16, v19
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v20
-; GFX9-NEXT:    v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v21
-; GFX9-NEXT:    v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v22
-; GFX9-NEXT:    v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v23
-; GFX9-NEXT:    v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v20, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v21, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v22, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v23, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v16
-; GFX9-NEXT:    v_pk_mul_f16 v16, v1, v17
-; GFX9-NEXT:    v_pk_mul_f16 v18, v2, v18
-; GFX9-NEXT:    v_pk_mul_f16 v17, v3, v19
-; GFX9-NEXT:    v_pk_mul_f16 v8, v4, v20
-; GFX9-NEXT:    v_pk_mul_f16 v10, v5, v21
-; GFX9-NEXT:    v_pk_mul_f16 v12, v6, v22
-; GFX9-NEXT:    v_pk_mul_f16 v14, v7, v23
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v18
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v17
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; GFX9-NEXT:    v_mov_b32_e32 v2, v16
-; GFX9-NEXT:    v_mov_b32_e32 v4, v18
-; GFX9-NEXT:    v_mov_b32_e32 v6, v17
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fmul_v32bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
-; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v16
-; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v17
-; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
-; GFX10-NEXT:    v_lshrrev_b32_e32 v27, 16, v19
-; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v20
-; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v21
-; GFX10-NEXT:    v_lshrrev_b32_e32 v30, 16, v22
-; GFX10-NEXT:    v_lshrrev_b32_e32 v31, 16, v23
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v20, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v21, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v22, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v23, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v16
-; GFX10-NEXT:    v_pk_mul_f16 v16, v1, v17
-; GFX10-NEXT:    v_pk_mul_f16 v18, v2, v18
-; GFX10-NEXT:    v_pk_mul_f16 v17, v3, v19
-; GFX10-NEXT:    v_pk_mul_f16 v8, v4, v20
-; GFX10-NEXT:    v_pk_mul_f16 v10, v5, v21
-; GFX10-NEXT:    v_pk_mul_f16 v12, v6, v22
-; GFX10-NEXT:    v_pk_mul_f16 v14, v7, v23
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v16
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v18
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v17
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
-; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; GFX10-NEXT:    v_mov_b32_e32 v2, v16
-; GFX10-NEXT:    v_mov_b32_e32 v4, v18
-; GFX10-NEXT:    v_mov_b32_e32 v6, v17
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fmul <32 x bfloat> %a, %b
-  ret <32 x bfloat> %op
-}
-
-define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fdiv_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
-; GCN-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
-; GCN-NEXT:    v_rcp_f32_e32 v4, v2
-; GCN-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
-; GCN-NEXT:    v_fma_f32 v4, v5, v4, v4
-; GCN-NEXT:    v_mul_f32_e32 v5, v3, v4
-; GCN-NEXT:    v_fma_f32 v6, -v2, v5, v3
-; GCN-NEXT:    v_fma_f32 v5, v6, v4, v5
-; GCN-NEXT:    v_fma_f32 v2, -v2, v5, v3
-; GCN-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
-; GCN-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fdiv_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
-; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
-; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
-; GFX7-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
-; GFX7-NEXT:    v_fma_f32 v3, v5, v3, v3
-; GFX7-NEXT:    v_mul_f32_e32 v5, v4, v3
-; GFX7-NEXT:    v_fma_f32 v6, -v2, v5, v4
-; GFX7-NEXT:    v_fma_f32 v5, v6, v3, v5
-; GFX7-NEXT:    v_fma_f32 v2, -v2, v5, v4
-; GFX7-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
-; GFX7-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fdiv_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v0
-; GFX8-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX8-NEXT:    v_mul_f32_e32 v2, v3, v2
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX8-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fdiv_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v0
-; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX9-NEXT:    v_mul_f32_e32 v2, v3, v2
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX9-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fdiv_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; GFX10-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX10-NEXT:    v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fdiv bfloat %a, %b
-  ret bfloat %op
-}
-
-declare bfloat @llvm.fabs.bf16(bfloat)
-
-define bfloat @v_fabs_bf16(bfloat %a) {
-; GCN-LABEL: v_fabs_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fabs_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fabs_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fabs_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fabs_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call bfloat @llvm.fabs.bf16(bfloat %a)
-  ret bfloat %op
-}
-
-define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) {
-; GCN-LABEL: s_fabs_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_and_b32 s0, s0, 0x7fff
-; GCN-NEXT:    s_and_b32 s0, 0xffff, s0
-; GCN-NEXT:    ; return to shader part epilog
-;
-; GFX7-LABEL: s_fabs_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_and_b32 s0, s0, 0x7fff
-; GFX7-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX7-NEXT:    ; return to shader part epilog
-;
-; GFX8-LABEL: s_fabs_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_and_b32 s0, s0, 0x7fff
-; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX8-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: s_fabs_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_and_b32 s0, s0, 0x7fff
-; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: s_fabs_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_and_b32 s0, s0, 0x7fff
-; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX10-NEXT:    ; return to shader part epilog
-  %op = call bfloat @llvm.fabs.bf16(bfloat %a)
-  %cast = bitcast bfloat %op to i16
-  %zext = zext i16 %cast to i32
-  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
-  ret i32 %readlane
-}
-
-define bfloat @v_fneg_bf16(bfloat %a) {
-; GCN-LABEL: v_fneg_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fneg_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fneg_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fneg_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fneg_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fneg bfloat %a
-  ret bfloat %op
-}
-
-declare i32 @llvm.amdgcn.readfirstlane(i32)
-
-; FIXME: readfirstlane hack for other bugs
-define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) {
-; GCN-LABEL: s_fneg_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_xor_b32 s0, s0, 0x8000
-; GCN-NEXT:    s_and_b32 s0, 0xffff, s0
-; GCN-NEXT:    ; return to shader part epilog
-;
-; GFX7-LABEL: s_fneg_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_xor_b32 s0, s0, 0x8000
-; GFX7-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX7-NEXT:    ; return to shader part epilog
-;
-; GFX8-LABEL: s_fneg_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_xor_b32 s0, s0, 0x8000
-; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX8-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: s_fneg_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_xor_b32 s0, s0, 0x8000
-; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: s_fneg_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_xor_b32 s0, s0, 0x8000
-; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX10-NEXT:    ; return to shader part epilog
-  %op = fneg bfloat %a
-  %cast = bitcast bfloat %op to i16
-  %zext = zext i16 %cast to i32
-  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
-  ret i32 %readlane
-}
-
-define bfloat @v_fneg_fabs_bf16(bfloat %a) {
-; GCN-LABEL: v_fneg_fabs_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_or_b32_e32 v0, 0x8000, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fneg_fabs_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_or_b32_e32 v0, 0x8000, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fneg_fabs_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_or_b32_e32 v0, 0x8000, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fneg_fabs_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_or_b32_e32 v0, 0x8000, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fneg_fabs_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_or_b32_e32 v0, 0x8000, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
-  %op = fneg bfloat %fabs
-  ret bfloat %op
-}
-
-; FIXME: readfirstlane hack for other bugs
-define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
-; GCN-LABEL: s_fneg_fabs_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_bitset1_b32 s0, 15
-; GCN-NEXT:    s_and_b32 s0, 0xffff, s0
-; GCN-NEXT:    ; return to shader part epilog
-;
-; GFX7-LABEL: s_fneg_fabs_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_bitset1_b32 s0, 15
-; GFX7-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX7-NEXT:    ; return to shader part epilog
-;
-; GFX8-LABEL: s_fneg_fabs_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_bitset1_b32 s0, 15
-; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX8-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: s_fneg_fabs_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_bitset1_b32 s0, 15
-; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: s_fneg_fabs_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_bitset1_b32 s0, 15
-; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX10-NEXT:    ; return to shader part epilog
-  %fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
-  %op = fneg bfloat %fabs
-  %cast = bitcast bfloat %op to i16
-  %zext = zext i16 %cast to i32
-  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
-  ret i32 %readlane
-}
-
-declare bfloat @llvm.minnum.bf16(bfloat, bfloat)
-declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
-declare <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat>, <3 x bfloat>)
-declare <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
-declare <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat>, <8 x bfloat>)
-declare <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat>, <16 x bfloat>)
-declare <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat>, <32 x bfloat>)
-
-define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_minnum_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_minnum_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_minnum_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX8-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_minnum_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_minnum_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX10-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX10-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call bfloat @llvm.minnum.bf16(bfloat %a, bfloat %b)
-  ret bfloat %op
-}
-
-define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
-; GCN-LABEL: v_minnum_v2bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_min_f32_e32 v0, v0, v2
-; GCN-NEXT:    v_min_f32_e32 v1, v1, v3
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_minnum_v2bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_minnum_v2bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_e32 v2, v0, v0
-; GFX8-NEXT:    v_max_f16_e32 v3, v1, v1
-; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_min_f16_e32 v2, v2, v3
-; GFX8-NEXT:    v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_minnum_v2bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_minnum_v2bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
-  ret <2 x bfloat> %op
-}
-
-define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
-; GCN-LABEL: v_minnum_v3bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    v_min_f32_e32 v0, v0, v3
-; GCN-NEXT:    v_min_f32_e32 v1, v1, v4
-; GCN-NEXT:    v_min_f32_e32 v2, v2, v5
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_minnum_v3bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_minnum_v3bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_e32 v1, v0, v0
-; GFX8-NEXT:    v_max_f16_e32 v3, v2, v2
-; GFX8-NEXT:    v_min_f16_e32 v3, v1, v3
-; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_min_f16_e32 v1, v0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v0, v3
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_minnum_v3bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
-; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v0
-; GFX9-NEXT:    v_bfi_b32 v1, s4, v2, v2
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_minnum_v3bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
-; GFX10-NEXT:    v_bfi_b32 v1, 0xffff, v2, v2
-; GFX10-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
-  ret <3 x bfloat> %op
-}
-
-define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
-; GCN-LABEL: v_minnum_v4bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT:    v_min_f32_e32 v0, v0, v4
-; GCN-NEXT:    v_min_f32_e32 v1, v1, v5
-; GCN-NEXT:    v_min_f32_e32 v2, v2, v6
-; GCN-NEXT:    v_min_f32_e32 v3, v3, v7
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_minnum_v4bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v6
-; GFX7-NEXT:    v_min_f32_e32 v1, v1, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v7
-; GFX7-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_min_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_minnum_v4bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_e32 v1, v0, v0
-; GFX8-NEXT:    v_max_f16_e32 v3, v2, v2
-; GFX8-NEXT:    v_min_f16_e32 v3, v1, v3
-; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_min_f16_e32 v1, v0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v0, v3
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_minnum_v4bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT:    v_pk_max_f16 v1, v2, v2
-; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_minnum_v4bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-NEXT:    v_pk_max_f16 v1, v2, v2
-; GFX10-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
-  ret <4 x bfloat> %op
-}
-
-define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
-; GCN-LABEL: v_minnum_v8bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT:    v_min_f32_e32 v0, v0, v8
-; GCN-NEXT:    v_min_f32_e32 v1, v1, v9
-; GCN-NEXT:    v_min_f32_e32 v2, v2, v10
-; GCN-NEXT:    v_min_f32_e32 v3, v3, v11
-; GCN-NEXT:    v_min_f32_e32 v4, v4, v12
-; GCN-NEXT:    v_min_f32_e32 v5, v5, v13
-; GCN-NEXT:    v_min_f32_e32 v6, v6, v14
-; GCN-NEXT:    v_min_f32_e32 v7, v7, v15
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_minnum_v8bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_min_f32_e32 v0, v0, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v10
-; GFX7-NEXT:    v_min_f32_e32 v1, v1, v9
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v11
-; GFX7-NEXT:    v_min_f32_e32 v2, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v12
-; GFX7-NEXT:    v_min_f32_e32 v3, v3, v9
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v13
-; GFX7-NEXT:    v_min_f32_e32 v4, v4, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v14
-; GFX7-NEXT:    v_min_f32_e32 v5, v5, v9
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v15
-; GFX7-NEXT:    v_min_f32_e32 v6, v6, v8
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_min_f32_e32 v7, v7, v9
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_minnum_v8bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_e32 v2, v0, v0
-; GFX8-NEXT:    v_max_f16_e32 v3, v4, v4
-; GFX8-NEXT:    v_min_f16_e32 v6, v2, v3
-; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_min_f16_e32 v4, v0, v2
-; GFX8-NEXT:    v_max_f16_e32 v0, v1, v1
-; GFX8-NEXT:    v_max_f16_e32 v2, v5, v5
-; GFX8-NEXT:    v_min_f16_e32 v2, v0, v2
-; GFX8-NEXT:    v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_min_f16_e32 v3, v0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v0, v6
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_minnum_v8bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT:    v_pk_max_f16 v2, v4, v4
-; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-NEXT:    v_pk_max_f16 v2, v5, v5
-; GFX9-NEXT:    v_pk_min_f16 v2, v1, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_minnum_v8bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-NEXT:    v_pk_max_f16 v2, v4, v4
-; GFX10-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-NEXT:    v_pk_max_f16 v3, v5, v5
-; GFX10-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX10-NEXT:    v_pk_min_f16 v2, v1, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
-  ret <8 x bfloat> %op
-}
-
-define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
-; GCN-LABEL: v_minnum_v16bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT:    v_min_f32_e32 v0, v0, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v17
-; GCN-NEXT:    v_min_f32_e32 v1, v1, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v18
-; GCN-NEXT:    v_min_f32_e32 v2, v2, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v19
-; GCN-NEXT:    v_min_f32_e32 v3, v3, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v20
-; GCN-NEXT:    v_min_f32_e32 v4, v4, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v21
-; GCN-NEXT:    v_min_f32_e32 v5, v5, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v22
-; GCN-NEXT:    v_min_f32_e32 v6, v6, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v23
-; GCN-NEXT:    v_min_f32_e32 v7, v7, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v24
-; GCN-NEXT:    v_min_f32_e32 v8, v8, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v25
-; GCN-NEXT:    v_min_f32_e32 v9, v9, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v26
-; GCN-NEXT:    v_min_f32_e32 v10, v10, v16
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT:    v_cvt_f32_f16_e32 v17, v27
-; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT:    v_cvt_f32_f16_e32 v18, v28
-; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT:    v_cvt_f32_f16_e32 v19, v29
-; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT:    v_cvt_f32_f16_e32 v20, v30
-; GCN-NEXT:    v_min_f32_e32 v11, v11, v17
-; GCN-NEXT:    v_min_f32_e32 v12, v12, v18
-; GCN-NEXT:    v_min_f32_e32 v13, v13, v19
-; GCN-NEXT:    v_min_f32_e32 v14, v14, v20
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; GCN-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; GCN-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT:    v_min_f32_e32 v15, v15, v16
-; GCN-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_minnum_v16bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_min_f32_e32 v0, v0, v16
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v20
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_min_f32_e32 v1, v1, v16
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v18
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v21
-; GFX7-NEXT:    v_min_f32_e32 v4, v4, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT:    v_min_f32_e32 v2, v2, v16
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v19
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v22
-; GFX7-NEXT:    v_min_f32_e32 v5, v5, v18
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GFX7-NEXT:    v_min_f32_e32 v3, v3, v16
-; GFX7-NEXT:    buffer_load_dword v16, off, s[0:3], s32
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v23
-; GFX7-NEXT:    v_min_f32_e32 v6, v6, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v24
-; GFX7-NEXT:    v_min_f32_e32 v7, v7, v18
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v25
-; GFX7-NEXT:    v_min_f32_e32 v8, v8, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v26
-; GFX7-NEXT:    v_min_f32_e32 v9, v9, v18
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v27
-; GFX7-NEXT:    v_min_f32_e32 v10, v10, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v28
-; GFX7-NEXT:    v_min_f32_e32 v11, v11, v18
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v29
-; GFX7-NEXT:    v_min_f32_e32 v12, v12, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v30
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; GFX7-NEXT:    v_min_f32_e32 v13, v13, v18
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_min_f32_e32 v14, v14, v17
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT:    v_min_f32_e32 v15, v15, v16
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_minnum_v16bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_e32 v4, v0, v0
-; GFX8-NEXT:    v_max_f16_e32 v5, v8, v8
-; GFX8-NEXT:    v_min_f16_e32 v12, v4, v5
-; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v4, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_min_f16_e32 v8, v0, v4
-; GFX8-NEXT:    v_max_f16_e32 v0, v1, v1
-; GFX8-NEXT:    v_max_f16_e32 v4, v9, v9
-; GFX8-NEXT:    v_min_f16_e32 v13, v0, v4
-; GFX8-NEXT:    v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_min_f16_e32 v9, v0, v1
-; GFX8-NEXT:    v_max_f16_e32 v0, v2, v2
-; GFX8-NEXT:    v_max_f16_e32 v1, v10, v10
-; GFX8-NEXT:    v_min_f16_e32 v4, v0, v1
-; GFX8-NEXT:    v_max_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_min_f16_e32 v5, v0, v1
-; GFX8-NEXT:    v_max_f16_e32 v0, v3, v3
-; GFX8-NEXT:    v_max_f16_e32 v1, v11, v11
-; GFX8-NEXT:    v_min_f16_e32 v6, v0, v1
-; GFX8-NEXT:    v_max_f16_sdwa v0, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_min_f16_e32 v7, v0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v0, v12
-; GFX8-NEXT:    v_mov_b32_e32 v1, v8
-; GFX8-NEXT:    v_mov_b32_e32 v2, v13
-; GFX8-NEXT:    v_mov_b32_e32 v3, v9
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_minnum_v16bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v10
-; GFX9-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT:    v_pk_max_f16 v4, v8, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
-; GFX9-NEXT:    v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_min_f16 v0, v0, v4
-; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-NEXT:    v_pk_max_f16 v4, v9, v9
-; GFX9-NEXT:    v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_min_f16 v8, v1, v4
-; GFX9-NEXT:    v_pk_max_f16 v1, v2, v2
-; GFX9-NEXT:    v_pk_max_f16 v2, v10, v10
-; GFX9-NEXT:    v_pk_min_f16 v4, v1, v2
-; GFX9-NEXT:    v_pk_max_f16 v1, v3, v3
-; GFX9-NEXT:    v_pk_max_f16 v2, v11, v11
-; GFX9-NEXT:    v_pk_min_f16 v6, v1, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; GFX9-NEXT:    v_mov_b32_e32 v2, v8
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_minnum_v16bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
-; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v10
-; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-NEXT:    v_pk_max_f16 v4, v8, v8
-; GFX10-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-NEXT:    v_pk_max_f16 v5, v9, v9
-; GFX10-NEXT:    v_pk_max_f16 v6, v2, v2
-; GFX10-NEXT:    v_pk_max_f16 v7, v10, v10
-; GFX10-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX10-NEXT:    v_pk_max_f16 v8, v11, v11
-; GFX10-NEXT:    v_pk_min_f16 v0, v0, v4
-; GFX10-NEXT:    v_pk_min_f16 v2, v1, v5
-; GFX10-NEXT:    v_pk_min_f16 v4, v6, v7
-; GFX10-NEXT:    v_pk_min_f16 v6, v3, v8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
-  ret <16 x bfloat> %op
-}
-
-define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
-; GCN-LABEL: v_minnum_v32bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    v_min_f32_e32 v0, v0, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
-; GCN-NEXT:    v_min_f32_e32 v1, v1, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:16
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_min_f32_e32 v2, v2, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
-; GCN-NEXT:    v_min_f32_e32 v3, v3, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_min_f32_e32 v4, v4, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
-; GCN-NEXT:    v_min_f32_e32 v5, v5, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:32
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_min_f32_e32 v6, v6, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
-; GCN-NEXT:    v_min_f32_e32 v7, v7, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:40
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_min_f32_e32 v8, v8, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
-; GCN-NEXT:    v_min_f32_e32 v9, v9, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:48
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_min_f32_e32 v10, v10, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
-; GCN-NEXT:    v_min_f32_e32 v11, v11, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:56
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_min_f32_e32 v12, v12, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
-; GCN-NEXT:    v_min_f32_e32 v13, v13, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:64
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_min_f32_e32 v14, v14, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
-; GCN-NEXT:    v_min_f32_e32 v15, v15, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_min_f32_e32 v16, v16, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
-; GCN-NEXT:    v_min_f32_e32 v17, v17, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_min_f32_e32 v18, v18, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
-; GCN-NEXT:    v_min_f32_e32 v19, v19, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:88
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_min_f32_e32 v20, v20, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
-; GCN-NEXT:    v_min_f32_e32 v21, v21, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v22, v22
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:96
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_min_f32_e32 v22, v22, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100
-; GCN-NEXT:    v_min_f32_e32 v23, v23, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v24, v24
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:104
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_min_f32_e32 v24, v24, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v25, v25
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
-; GCN-NEXT:    v_min_f32_e32 v25, v25, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v26, v26
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:112
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_min_f32_e32 v26, v26, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v27, v27
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
-; GCN-NEXT:    v_min_f32_e32 v27, v27, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v28, v28
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_min_f32_e32 v28, v28, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v29, v29
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    v_min_f32_e32 v29, v29, v31
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:124
-; GCN-NEXT:    v_cvt_f32_f16_e32 v30, v30
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32
-; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
-; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    v_min_f32_e32 v30, v30, v31
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v32
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v33
-; GCN-NEXT:    v_min_f32_e32 v31, v31, v32
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; GCN-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; GCN-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; GCN-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v20
-; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v21
-; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; GCN-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; GCN-NEXT:    v_cvt_f16_f32_e32 v24, v24
-; GCN-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; GCN-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; GCN-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v28
-; GCN-NEXT:    v_cvt_f16_f32_e32 v29, v29
-; GCN-NEXT:    v_cvt_f16_f32_e32 v30, v30
-; GCN-NEXT:    v_cvt_f16_f32_e32 v31, v31
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_minnum_v32bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v22, v22
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v24, v24
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v25, v25
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v26, v26
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v27, v27
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v28, v28
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v29, v29
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v30, v30
-; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v0, v0, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v1, v1, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:12
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v2, v2, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:16
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v3, v3, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:20
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v4, v4, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v5, v5, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v6, v6, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:32
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v7, v7, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:36
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v8, v8, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:40
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v9, v9, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:44
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v10, v10, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:48
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v11, v11, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:52
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v12, v12, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:56
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v13, v13, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:60
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v14, v14, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v15, v15, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v16, v16, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v17, v17, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:76
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v18, v18, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v19, v19, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:84
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v20, v20, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:88
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v20, v20
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v21, v21, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v21, v21
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v22, v22, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:96
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v23, v23, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:100
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v24, v24, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:104
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v24, v24
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v25, v25, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:108
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v26, v26, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:112
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v27, v27, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v28, v28, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v28, v28
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v29, v29, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:124
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v29, v29
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v30, v30, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v30, v30
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_min_f32_e32 v31, v31, v32
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v31, v31
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_minnum_v32bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_e32 v8, v0, v0
-; GFX8-NEXT:    v_max_f16_e32 v9, v16, v16
-; GFX8-NEXT:    v_min_f16_e32 v24, v8, v9
-; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v8, v16, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_min_f16_e32 v16, v0, v8
-; GFX8-NEXT:    v_max_f16_e32 v0, v1, v1
-; GFX8-NEXT:    v_max_f16_e32 v8, v17, v17
-; GFX8-NEXT:    v_min_f16_e32 v25, v0, v8
-; GFX8-NEXT:    v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v17, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_min_f16_e32 v17, v0, v1
-; GFX8-NEXT:    v_max_f16_e32 v0, v2, v2
-; GFX8-NEXT:    v_max_f16_e32 v1, v18, v18
-; GFX8-NEXT:    v_min_f16_e32 v26, v0, v1
-; GFX8-NEXT:    v_max_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v18, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_min_f16_e32 v18, v0, v1
-; GFX8-NEXT:    v_max_f16_e32 v0, v3, v3
-; GFX8-NEXT:    v_max_f16_e32 v1, v19, v19
-; GFX8-NEXT:    v_min_f16_e32 v27, v0, v1
-; GFX8-NEXT:    v_max_f16_sdwa v0, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v19, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_min_f16_e32 v19, v0, v1
-; GFX8-NEXT:    v_max_f16_e32 v0, v4, v4
-; GFX8-NEXT:    v_max_f16_e32 v1, v20, v20
-; GFX8-NEXT:    v_min_f16_e32 v8, v0, v1
-; GFX8-NEXT:    v_max_f16_sdwa v0, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v20, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_min_f16_e32 v9, v0, v1
-; GFX8-NEXT:    v_max_f16_e32 v0, v5, v5
-; GFX8-NEXT:    v_max_f16_e32 v1, v21, v21
-; GFX8-NEXT:    v_min_f16_e32 v10, v0, v1
-; GFX8-NEXT:    v_max_f16_sdwa v0, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v21, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_min_f16_e32 v11, v0, v1
-; GFX8-NEXT:    v_max_f16_e32 v0, v6, v6
-; GFX8-NEXT:    v_max_f16_e32 v1, v22, v22
-; GFX8-NEXT:    v_min_f16_e32 v12, v0, v1
-; GFX8-NEXT:    v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v22, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_min_f16_e32 v13, v0, v1
-; GFX8-NEXT:    v_max_f16_e32 v0, v7, v7
-; GFX8-NEXT:    v_max_f16_e32 v1, v23, v23
-; GFX8-NEXT:    v_min_f16_e32 v14, v0, v1
-; GFX8-NEXT:    v_max_f16_sdwa v0, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v23, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_min_f16_e32 v15, v0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v0, v24
-; GFX8-NEXT:    v_mov_b32_e32 v1, v16
-; GFX8-NEXT:    v_mov_b32_e32 v2, v25
-; GFX8-NEXT:    v_mov_b32_e32 v3, v17
-; GFX8-NEXT:    v_mov_b32_e32 v4, v26
-; GFX8-NEXT:    v_mov_b32_e32 v5, v18
-; GFX8-NEXT:    v_mov_b32_e32 v6, v27
-; GFX8-NEXT:    v_mov_b32_e32 v7, v19
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_minnum_v32bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v24, 16, v16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 16, v17
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v20
-; GFX9-NEXT:    v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
-; GFX9-NEXT:    v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v20, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT:    v_pk_max_f16 v8, v16, v16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 16, v19
-; GFX9-NEXT:    v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_min_f16 v0, v0, v8
-; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-NEXT:    v_pk_max_f16 v8, v17, v17
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
-; GFX9-NEXT:    v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_min_f16 v16, v1, v8
-; GFX9-NEXT:    v_pk_max_f16 v1, v2, v2
-; GFX9-NEXT:    v_pk_max_f16 v2, v18, v18
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v21
-; GFX9-NEXT:    v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_min_f16 v18, v1, v2
-; GFX9-NEXT:    v_pk_max_f16 v1, v3, v3
-; GFX9-NEXT:    v_pk_max_f16 v2, v19, v19
-; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v22
-; GFX9-NEXT:    v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v21, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_min_f16 v17, v1, v2
-; GFX9-NEXT:    v_pk_max_f16 v1, v4, v4
-; GFX9-NEXT:    v_pk_max_f16 v2, v20, v20
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v23
-; GFX9-NEXT:    v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v22, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_min_f16 v8, v1, v2
-; GFX9-NEXT:    v_pk_max_f16 v1, v5, v5
-; GFX9-NEXT:    v_pk_max_f16 v2, v21, v21
-; GFX9-NEXT:    v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v23, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_min_f16 v10, v1, v2
-; GFX9-NEXT:    v_pk_max_f16 v1, v6, v6
-; GFX9-NEXT:    v_pk_max_f16 v2, v22, v22
-; GFX9-NEXT:    v_pk_min_f16 v12, v1, v2
-; GFX9-NEXT:    v_pk_max_f16 v1, v7, v7
-; GFX9-NEXT:    v_pk_max_f16 v2, v23, v23
-; GFX9-NEXT:    v_pk_min_f16 v14, v1, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v18
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v17
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; GFX9-NEXT:    v_mov_b32_e32 v2, v16
-; GFX9-NEXT:    v_mov_b32_e32 v4, v18
-; GFX9-NEXT:    v_mov_b32_e32 v6, v17
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_minnum_v32bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v16
-; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v17
-; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
-; GFX10-NEXT:    v_lshrrev_b32_e32 v27, 16, v19
-; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v20
-; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v21
-; GFX10-NEXT:    v_lshrrev_b32_e32 v30, 16, v22
-; GFX10-NEXT:    v_lshrrev_b32_e32 v31, 16, v23
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v20, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v21, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-NEXT:    v_pk_max_f16 v8, v16, v16
-; GFX10-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-NEXT:    v_pk_max_f16 v9, v17, v17
-; GFX10-NEXT:    v_pk_max_f16 v10, v2, v2
-; GFX10-NEXT:    v_pk_max_f16 v11, v18, v18
-; GFX10-NEXT:    v_mov_b32_sdwa v22, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v23, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_pk_min_f16 v0, v0, v8
-; GFX10-NEXT:    v_pk_min_f16 v2, v1, v9
-; GFX10-NEXT:    v_pk_min_f16 v16, v10, v11
-; GFX10-NEXT:    v_pk_max_f16 v1, v3, v3
-; GFX10-NEXT:    v_pk_max_f16 v3, v19, v19
-; GFX10-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX10-NEXT:    v_pk_max_f16 v8, v20, v20
-; GFX10-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX10-NEXT:    v_pk_max_f16 v9, v21, v21
-; GFX10-NEXT:    v_pk_max_f16 v11, v6, v6
-; GFX10-NEXT:    v_pk_max_f16 v12, v22, v22
-; GFX10-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX10-NEXT:    v_pk_max_f16 v13, v23, v23
-; GFX10-NEXT:    v_pk_min_f16 v6, v1, v3
-; GFX10-NEXT:    v_pk_min_f16 v8, v4, v8
-; GFX10-NEXT:    v_pk_min_f16 v10, v5, v9
-; GFX10-NEXT:    v_pk_min_f16 v12, v11, v12
-; GFX10-NEXT:    v_pk_min_f16 v14, v7, v13
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v16
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
-; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; GFX10-NEXT:    v_mov_b32_e32 v4, v16
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
-  ret <32 x bfloat> %op
-}
-
-
-declare bfloat @llvm.maxnum.bf16(bfloat, bfloat)
-declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
-declare <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat>, <3 x bfloat>)
-declare <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
-declare <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat>, <8 x bfloat>)
-declare <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat>, <16 x bfloat>)
-declare <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat>, <32 x bfloat>)
-
-define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_maxnum_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_maxnum_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_maxnum_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX8-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_maxnum_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_maxnum_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX10-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX10-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b)
-  ret bfloat %op
-}
-
-define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
-; GCN-LABEL: v_maxnum_v2bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_max_f32_e32 v0, v0, v2
-; GCN-NEXT:    v_max_f32_e32 v1, v1, v3
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_maxnum_v2bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_maxnum_v2bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_e32 v2, v0, v0
-; GFX8-NEXT:    v_max_f16_e32 v3, v1, v1
-; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v2, v2, v3
-; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_maxnum_v2bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_maxnum_v2bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
-  ret <2 x bfloat> %op
-}
-
-define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
-; GCN-LABEL: v_maxnum_v3bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    v_max_f32_e32 v0, v0, v3
-; GCN-NEXT:    v_max_f32_e32 v1, v1, v4
-; GCN-NEXT:    v_max_f32_e32 v2, v2, v5
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_maxnum_v3bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_maxnum_v3bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_e32 v1, v0, v0
-; GFX8-NEXT:    v_max_f16_e32 v3, v2, v2
-; GFX8-NEXT:    v_max_f16_e32 v3, v1, v3
-; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v1, v0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v0, v3
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_maxnum_v3bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
-; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v0
-; GFX9-NEXT:    v_bfi_b32 v1, s4, v2, v2
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_maxnum_v3bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
-; GFX10-NEXT:    v_bfi_b32 v1, 0xffff, v2, v2
-; GFX10-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
-  ret <3 x bfloat> %op
-}
-
-define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
-; GCN-LABEL: v_maxnum_v4bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT:    v_max_f32_e32 v0, v0, v4
-; GCN-NEXT:    v_max_f32_e32 v1, v1, v5
-; GCN-NEXT:    v_max_f32_e32 v2, v2, v6
-; GCN-NEXT:    v_max_f32_e32 v3, v3, v7
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_maxnum_v4bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v6
-; GFX7-NEXT:    v_max_f32_e32 v1, v1, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v7
-; GFX7-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_max_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_maxnum_v4bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_e32 v1, v0, v0
-; GFX8-NEXT:    v_max_f16_e32 v3, v2, v2
-; GFX8-NEXT:    v_max_f16_e32 v3, v1, v3
-; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v1, v0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v0, v3
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_maxnum_v4bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT:    v_pk_max_f16 v1, v2, v2
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_maxnum_v4bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-NEXT:    v_pk_max_f16 v1, v2, v2
-; GFX10-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
-  ret <4 x bfloat> %op
-}
-
-define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
-; GCN-LABEL: v_maxnum_v8bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT:    v_max_f32_e32 v0, v0, v8
-; GCN-NEXT:    v_max_f32_e32 v1, v1, v9
-; GCN-NEXT:    v_max_f32_e32 v2, v2, v10
-; GCN-NEXT:    v_max_f32_e32 v3, v3, v11
-; GCN-NEXT:    v_max_f32_e32 v4, v4, v12
-; GCN-NEXT:    v_max_f32_e32 v5, v5, v13
-; GCN-NEXT:    v_max_f32_e32 v6, v6, v14
-; GCN-NEXT:    v_max_f32_e32 v7, v7, v15
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_maxnum_v8bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_max_f32_e32 v0, v0, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v10
-; GFX7-NEXT:    v_max_f32_e32 v1, v1, v9
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v11
-; GFX7-NEXT:    v_max_f32_e32 v2, v2, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v12
-; GFX7-NEXT:    v_max_f32_e32 v3, v3, v9
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v13
-; GFX7-NEXT:    v_max_f32_e32 v4, v4, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v14
-; GFX7-NEXT:    v_max_f32_e32 v5, v5, v9
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v15
-; GFX7-NEXT:    v_max_f32_e32 v6, v6, v8
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_max_f32_e32 v7, v7, v9
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_maxnum_v8bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_e32 v2, v0, v0
-; GFX8-NEXT:    v_max_f16_e32 v3, v4, v4
-; GFX8-NEXT:    v_max_f16_e32 v6, v2, v3
-; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v4, v0, v2
-; GFX8-NEXT:    v_max_f16_e32 v0, v1, v1
-; GFX8-NEXT:    v_max_f16_e32 v2, v5, v5
-; GFX8-NEXT:    v_max_f16_e32 v2, v0, v2
-; GFX8-NEXT:    v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v3, v0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v0, v6
-; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_maxnum_v8bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT:    v_pk_max_f16 v2, v4, v4
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-NEXT:    v_pk_max_f16 v2, v5, v5
-; GFX9-NEXT:    v_pk_max_f16 v2, v1, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_maxnum_v8bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-NEXT:    v_pk_max_f16 v2, v4, v4
-; GFX10-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-NEXT:    v_pk_max_f16 v3, v5, v5
-; GFX10-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX10-NEXT:    v_pk_max_f16 v2, v1, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
-  ret <8 x bfloat> %op
-}
-
-define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
-; GCN-LABEL: v_maxnum_v16bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT:    v_max_f32_e32 v0, v0, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v17
-; GCN-NEXT:    v_max_f32_e32 v1, v1, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v18
-; GCN-NEXT:    v_max_f32_e32 v2, v2, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v19
-; GCN-NEXT:    v_max_f32_e32 v3, v3, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v20
-; GCN-NEXT:    v_max_f32_e32 v4, v4, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v21
-; GCN-NEXT:    v_max_f32_e32 v5, v5, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v22
-; GCN-NEXT:    v_max_f32_e32 v6, v6, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v23
-; GCN-NEXT:    v_max_f32_e32 v7, v7, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v24
-; GCN-NEXT:    v_max_f32_e32 v8, v8, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v25
-; GCN-NEXT:    v_max_f32_e32 v9, v9, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v26
-; GCN-NEXT:    v_max_f32_e32 v10, v10, v16
-; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT:    v_cvt_f32_f16_e32 v17, v27
-; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT:    v_cvt_f32_f16_e32 v18, v28
-; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT:    v_cvt_f32_f16_e32 v19, v29
-; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT:    v_cvt_f32_f16_e32 v20, v30
-; GCN-NEXT:    v_max_f32_e32 v11, v11, v17
-; GCN-NEXT:    v_max_f32_e32 v12, v12, v18
-; GCN-NEXT:    v_max_f32_e32 v13, v13, v19
-; GCN-NEXT:    v_max_f32_e32 v14, v14, v20
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; GCN-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; GCN-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT:    v_max_f32_e32 v15, v15, v16
-; GCN-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_maxnum_v16bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_max_f32_e32 v0, v0, v16
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v20
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_max_f32_e32 v1, v1, v16
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v18
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v21
-; GFX7-NEXT:    v_max_f32_e32 v4, v4, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT:    v_max_f32_e32 v2, v2, v16
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v19
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v22
-; GFX7-NEXT:    v_max_f32_e32 v5, v5, v18
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GFX7-NEXT:    v_max_f32_e32 v3, v3, v16
-; GFX7-NEXT:    buffer_load_dword v16, off, s[0:3], s32
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v23
-; GFX7-NEXT:    v_max_f32_e32 v6, v6, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v24
-; GFX7-NEXT:    v_max_f32_e32 v7, v7, v18
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v25
-; GFX7-NEXT:    v_max_f32_e32 v8, v8, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v26
-; GFX7-NEXT:    v_max_f32_e32 v9, v9, v18
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v27
-; GFX7-NEXT:    v_max_f32_e32 v10, v10, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v28
-; GFX7-NEXT:    v_max_f32_e32 v11, v11, v18
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v29
-; GFX7-NEXT:    v_max_f32_e32 v12, v12, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v30
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; GFX7-NEXT:    v_max_f32_e32 v13, v13, v18
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_max_f32_e32 v14, v14, v17
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT:    v_max_f32_e32 v15, v15, v16
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_maxnum_v16bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_e32 v4, v0, v0
-; GFX8-NEXT:    v_max_f16_e32 v5, v8, v8
-; GFX8-NEXT:    v_max_f16_e32 v12, v4, v5
-; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v4, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v8, v0, v4
-; GFX8-NEXT:    v_max_f16_e32 v0, v1, v1
-; GFX8-NEXT:    v_max_f16_e32 v4, v9, v9
-; GFX8-NEXT:    v_max_f16_e32 v13, v0, v4
-; GFX8-NEXT:    v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v9, v0, v1
-; GFX8-NEXT:    v_max_f16_e32 v0, v2, v2
-; GFX8-NEXT:    v_max_f16_e32 v1, v10, v10
-; GFX8-NEXT:    v_max_f16_e32 v4, v0, v1
-; GFX8-NEXT:    v_max_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v5, v0, v1
-; GFX8-NEXT:    v_max_f16_e32 v0, v3, v3
-; GFX8-NEXT:    v_max_f16_e32 v1, v11, v11
-; GFX8-NEXT:    v_max_f16_e32 v6, v0, v1
-; GFX8-NEXT:    v_max_f16_sdwa v0, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v7, v0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v0, v12
-; GFX8-NEXT:    v_mov_b32_e32 v1, v8
-; GFX8-NEXT:    v_mov_b32_e32 v2, v13
-; GFX8-NEXT:    v_mov_b32_e32 v3, v9
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_maxnum_v16bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v10
-; GFX9-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT:    v_pk_max_f16 v4, v8, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
-; GFX9-NEXT:    v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v4
-; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-NEXT:    v_pk_max_f16 v4, v9, v9
-; GFX9-NEXT:    v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_max_f16 v8, v1, v4
-; GFX9-NEXT:    v_pk_max_f16 v1, v2, v2
-; GFX9-NEXT:    v_pk_max_f16 v2, v10, v10
-; GFX9-NEXT:    v_pk_max_f16 v4, v1, v2
-; GFX9-NEXT:    v_pk_max_f16 v1, v3, v3
-; GFX9-NEXT:    v_pk_max_f16 v2, v11, v11
-; GFX9-NEXT:    v_pk_max_f16 v6, v1, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; GFX9-NEXT:    v_mov_b32_e32 v2, v8
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_maxnum_v16bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
-; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v10
-; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-NEXT:    v_pk_max_f16 v4, v8, v8
-; GFX10-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-NEXT:    v_pk_max_f16 v5, v9, v9
-; GFX10-NEXT:    v_pk_max_f16 v6, v2, v2
-; GFX10-NEXT:    v_pk_max_f16 v7, v10, v10
-; GFX10-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX10-NEXT:    v_pk_max_f16 v8, v11, v11
-; GFX10-NEXT:    v_pk_max_f16 v0, v0, v4
-; GFX10-NEXT:    v_pk_max_f16 v2, v1, v5
-; GFX10-NEXT:    v_pk_max_f16 v4, v6, v7
-; GFX10-NEXT:    v_pk_max_f16 v6, v3, v8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
-  ret <16 x bfloat> %op
-}
-
-define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
-; GCN-LABEL: v_maxnum_v32bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    v_max_f32_e32 v0, v0, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
-; GCN-NEXT:    v_max_f32_e32 v1, v1, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:16
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_max_f32_e32 v2, v2, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
-; GCN-NEXT:    v_max_f32_e32 v3, v3, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_max_f32_e32 v4, v4, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
-; GCN-NEXT:    v_max_f32_e32 v5, v5, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:32
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_max_f32_e32 v6, v6, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
-; GCN-NEXT:    v_max_f32_e32 v7, v7, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:40
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_max_f32_e32 v8, v8, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
-; GCN-NEXT:    v_max_f32_e32 v9, v9, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:48
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_max_f32_e32 v10, v10, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
-; GCN-NEXT:    v_max_f32_e32 v11, v11, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:56
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_max_f32_e32 v12, v12, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
-; GCN-NEXT:    v_max_f32_e32 v13, v13, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:64
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_max_f32_e32 v14, v14, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
-; GCN-NEXT:    v_max_f32_e32 v15, v15, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_max_f32_e32 v16, v16, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
-; GCN-NEXT:    v_max_f32_e32 v17, v17, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_max_f32_e32 v18, v18, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
-; GCN-NEXT:    v_max_f32_e32 v19, v19, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:88
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_max_f32_e32 v20, v20, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
-; GCN-NEXT:    v_max_f32_e32 v21, v21, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v22, v22
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:96
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_max_f32_e32 v22, v22, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100
-; GCN-NEXT:    v_max_f32_e32 v23, v23, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v24, v24
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:104
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_max_f32_e32 v24, v24, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v25, v25
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
-; GCN-NEXT:    v_max_f32_e32 v25, v25, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v26, v26
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:112
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_max_f32_e32 v26, v26, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v27, v27
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
-; GCN-NEXT:    v_max_f32_e32 v27, v27, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v28, v28
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT:    v_max_f32_e32 v28, v28, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v29, v29
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    v_max_f32_e32 v29, v29, v31
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:124
-; GCN-NEXT:    v_cvt_f32_f16_e32 v30, v30
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32
-; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:128
-; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT:    v_max_f32_e32 v30, v30, v31
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v31, v32
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v32, v33
-; GCN-NEXT:    v_max_f32_e32 v31, v31, v32
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; GCN-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; GCN-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; GCN-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v20
-; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v21
-; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; GCN-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; GCN-NEXT:    v_cvt_f16_f32_e32 v24, v24
-; GCN-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; GCN-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; GCN-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v28
-; GCN-NEXT:    v_cvt_f16_f32_e32 v29, v29
-; GCN-NEXT:    v_cvt_f16_f32_e32 v30, v30
-; GCN-NEXT:    v_cvt_f16_f32_e32 v31, v31
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_maxnum_v32bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v22, v22
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v24, v24
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v25, v25
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v26, v26
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v27, v27
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v28, v28
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v29, v29
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v30, v30
-; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v0, v0, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v1, v1, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:12
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v2, v2, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:16
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v3, v3, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:20
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v4, v4, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v5, v5, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v6, v6, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:32
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v7, v7, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:36
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v8, v8, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:40
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v9, v9, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:44
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v10, v10, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:48
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v11, v11, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:52
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v12, v12, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:56
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v13, v13, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:60
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v14, v14, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:64
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v15, v15, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:68
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v16, v16, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:72
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v17, v17, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:76
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v18, v18, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v19, v19, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:84
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v20, v20, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:88
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v20, v20
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v21, v21, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v21, v21
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v22, v22, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:96
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v23, v23, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:100
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v24, v24, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:104
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v24, v24
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v25, v25, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:108
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v26, v26, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:112
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v27, v27, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:116
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v28, v28, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v28, v28
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v29, v29, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:124
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v29, v29
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v30, v30, v31
-; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v30, v30
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT:    v_max_f32_e32 v31, v31, v32
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v31, v31
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_maxnum_v32bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_e32 v8, v0, v0
-; GFX8-NEXT:    v_max_f16_e32 v9, v16, v16
-; GFX8-NEXT:    v_max_f16_e32 v24, v8, v9
-; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v8, v16, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v16, v0, v8
-; GFX8-NEXT:    v_max_f16_e32 v0, v1, v1
-; GFX8-NEXT:    v_max_f16_e32 v8, v17, v17
-; GFX8-NEXT:    v_max_f16_e32 v25, v0, v8
-; GFX8-NEXT:    v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v17, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v17, v0, v1
-; GFX8-NEXT:    v_max_f16_e32 v0, v2, v2
-; GFX8-NEXT:    v_max_f16_e32 v1, v18, v18
-; GFX8-NEXT:    v_max_f16_e32 v26, v0, v1
-; GFX8-NEXT:    v_max_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v18, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v18, v0, v1
-; GFX8-NEXT:    v_max_f16_e32 v0, v3, v3
-; GFX8-NEXT:    v_max_f16_e32 v1, v19, v19
-; GFX8-NEXT:    v_max_f16_e32 v27, v0, v1
-; GFX8-NEXT:    v_max_f16_sdwa v0, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v19, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v19, v0, v1
-; GFX8-NEXT:    v_max_f16_e32 v0, v4, v4
-; GFX8-NEXT:    v_max_f16_e32 v1, v20, v20
-; GFX8-NEXT:    v_max_f16_e32 v8, v0, v1
-; GFX8-NEXT:    v_max_f16_sdwa v0, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v20, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v9, v0, v1
-; GFX8-NEXT:    v_max_f16_e32 v0, v5, v5
-; GFX8-NEXT:    v_max_f16_e32 v1, v21, v21
-; GFX8-NEXT:    v_max_f16_e32 v10, v0, v1
-; GFX8-NEXT:    v_max_f16_sdwa v0, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v21, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v11, v0, v1
-; GFX8-NEXT:    v_max_f16_e32 v0, v6, v6
-; GFX8-NEXT:    v_max_f16_e32 v1, v22, v22
-; GFX8-NEXT:    v_max_f16_e32 v12, v0, v1
-; GFX8-NEXT:    v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v22, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v13, v0, v1
-; GFX8-NEXT:    v_max_f16_e32 v0, v7, v7
-; GFX8-NEXT:    v_max_f16_e32 v1, v23, v23
-; GFX8-NEXT:    v_max_f16_e32 v14, v0, v1
-; GFX8-NEXT:    v_max_f16_sdwa v0, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_sdwa v1, v23, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_max_f16_e32 v15, v0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v0, v24
-; GFX8-NEXT:    v_mov_b32_e32 v1, v16
-; GFX8-NEXT:    v_mov_b32_e32 v2, v25
-; GFX8-NEXT:    v_mov_b32_e32 v3, v17
-; GFX8-NEXT:    v_mov_b32_e32 v4, v26
-; GFX8-NEXT:    v_mov_b32_e32 v5, v18
-; GFX8-NEXT:    v_mov_b32_e32 v6, v27
-; GFX8-NEXT:    v_mov_b32_e32 v7, v19
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_maxnum_v32bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v24, 16, v16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 16, v17
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v20
-; GFX9-NEXT:    v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
-; GFX9-NEXT:    v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v20, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT:    v_pk_max_f16 v8, v16, v16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 16, v19
-; GFX9-NEXT:    v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v8
-; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-NEXT:    v_pk_max_f16 v8, v17, v17
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
-; GFX9-NEXT:    v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_max_f16 v16, v1, v8
-; GFX9-NEXT:    v_pk_max_f16 v1, v2, v2
-; GFX9-NEXT:    v_pk_max_f16 v2, v18, v18
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v21
-; GFX9-NEXT:    v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_max_f16 v18, v1, v2
-; GFX9-NEXT:    v_pk_max_f16 v1, v3, v3
-; GFX9-NEXT:    v_pk_max_f16 v2, v19, v19
-; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v22
-; GFX9-NEXT:    v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v21, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_max_f16 v17, v1, v2
-; GFX9-NEXT:    v_pk_max_f16 v1, v4, v4
-; GFX9-NEXT:    v_pk_max_f16 v2, v20, v20
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v23
-; GFX9-NEXT:    v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v22, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_max_f16 v8, v1, v2
-; GFX9-NEXT:    v_pk_max_f16 v1, v5, v5
-; GFX9-NEXT:    v_pk_max_f16 v2, v21, v21
-; GFX9-NEXT:    v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v23, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_max_f16 v10, v1, v2
-; GFX9-NEXT:    v_pk_max_f16 v1, v6, v6
-; GFX9-NEXT:    v_pk_max_f16 v2, v22, v22
-; GFX9-NEXT:    v_pk_max_f16 v12, v1, v2
-; GFX9-NEXT:    v_pk_max_f16 v1, v7, v7
-; GFX9-NEXT:    v_pk_max_f16 v2, v23, v23
-; GFX9-NEXT:    v_pk_max_f16 v14, v1, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v18
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v17
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; GFX9-NEXT:    v_mov_b32_e32 v2, v16
-; GFX9-NEXT:    v_mov_b32_e32 v4, v18
-; GFX9-NEXT:    v_mov_b32_e32 v6, v17
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_maxnum_v32bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v16
-; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v17
-; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
-; GFX10-NEXT:    v_lshrrev_b32_e32 v27, 16, v19
-; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v20
-; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v21
-; GFX10-NEXT:    v_lshrrev_b32_e32 v30, 16, v22
-; GFX10-NEXT:    v_lshrrev_b32_e32 v31, 16, v23
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v20, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v21, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-NEXT:    v_pk_max_f16 v8, v16, v16
-; GFX10-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-NEXT:    v_pk_max_f16 v9, v17, v17
-; GFX10-NEXT:    v_pk_max_f16 v10, v2, v2
-; GFX10-NEXT:    v_pk_max_f16 v11, v18, v18
-; GFX10-NEXT:    v_mov_b32_sdwa v22, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v23, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_pk_max_f16 v0, v0, v8
-; GFX10-NEXT:    v_pk_max_f16 v2, v1, v9
-; GFX10-NEXT:    v_pk_max_f16 v16, v10, v11
-; GFX10-NEXT:    v_pk_max_f16 v1, v3, v3
-; GFX10-NEXT:    v_pk_max_f16 v3, v19, v19
-; GFX10-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX10-NEXT:    v_pk_max_f16 v8, v20, v20
-; GFX10-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX10-NEXT:    v_pk_max_f16 v9, v21, v21
-; GFX10-NEXT:    v_pk_max_f16 v11, v6, v6
-; GFX10-NEXT:    v_pk_max_f16 v12, v22, v22
-; GFX10-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX10-NEXT:    v_pk_max_f16 v13, v23, v23
-; GFX10-NEXT:    v_pk_max_f16 v6, v1, v3
-; GFX10-NEXT:    v_pk_max_f16 v8, v4, v8
-; GFX10-NEXT:    v_pk_max_f16 v10, v5, v9
-; GFX10-NEXT:    v_pk_max_f16 v12, v11, v12
-; GFX10-NEXT:    v_pk_max_f16 v14, v7, v13
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v16
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
-; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; GFX10-NEXT:    v_mov_b32_e32 v4, v16
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
-  ret <32 x bfloat> %op
-}
-
-declare bfloat @llvm.sqrt.bf16(bfloat)
-
-define bfloat @v_sqrt_bf16(bfloat %a) {
-; GCN-LABEL: v_sqrt_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_sqrt_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_sqrt_f32_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_sqrt_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_sqrt_f16_e32 v0, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_sqrt_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_sqrt_f16_e32 v0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_sqrt_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_sqrt_f16_e32 v0, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call bfloat @llvm.sqrt.bf16(bfloat %a)
-  ret bfloat %op
-}
-
-declare bfloat @llvm.ldexp.bf16.i32(bfloat, i32)
-
-define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
-; GCN-LABEL: v_ldexp_bf16_i32:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_ldexp_bf16_i32:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_ldexp_bf16_i32:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0xffff8000
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7fff
-; GFX8-NEXT:    v_med3_i32 v1, v1, v2, v3
-; GFX8-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_ldexp_bf16_i32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff8000
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fff
-; GFX9-NEXT:    v_med3_i32 v1, v1, v2, v3
-; GFX9-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_ldexp_bf16_i32:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX10-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
-; GFX10-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call bfloat @llvm.ldexp.bf16.i32(bfloat %a, i32 %b)
-  ret bfloat %op
-}
-
-declare { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat)
-
-define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
-; GCN-LABEL: v_frexp_bf16_i16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x7f800000
-; GCN-NEXT:    v_frexp_mant_f32_e32 v2, v0
-; GCN-NEXT:    v_frexp_exp_i32_f32_e32 v3, v0
-; GCN-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_frexp_bf16_i16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX7-NEXT:    v_frexp_mant_f32_e32 v0, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_frexp_bf16_i16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_frexp_mant_f16_e32 v2, v0
-; GFX8-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v2
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_frexp_bf16_i16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_frexp_mant_f16_e32 v2, v0
-; GFX9-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_frexp_bf16_i16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_frexp_mant_f16_e32 v2, v0
-; GFX10-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
-; GFX10-NEXT:    v_mov_b32_e32 v0, v2
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat %a)
-  ret { bfloat, i16 } %op
-}
-
-
-declare bfloat @llvm.log.bf16(bfloat)
-declare bfloat @llvm.log2.bf16(bfloat)
-declare bfloat @llvm.log10.bf16(bfloat)
-
-define bfloat @v_log_bf16(bfloat %a) {
-; GCN-LABEL: v_log_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_log_f32_e32 v0, v0
-; GCN-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_log_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_log_f32_e32 v0, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_log_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_log_f16_e32 v0, v0
-; GFX8-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_log_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_log_f16_e32 v0, v0
-; GFX9-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_log_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_log_f16_e32 v0, v0
-; GFX10-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call bfloat @llvm.log.bf16(bfloat %a)
-  ret bfloat %op
-}
-
-define bfloat @v_log2_bf16(bfloat %a) {
-; GCN-LABEL: v_log2_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_log_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_log2_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_log_f32_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_log2_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_log_f16_e32 v0, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_log2_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_log_f16_e32 v0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_log2_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_log_f16_e32 v0, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call bfloat @llvm.log2.bf16(bfloat %a)
-  ret bfloat %op
-}
-
-define bfloat @v_log10_bf16(bfloat %a) {
-; GCN-LABEL: v_log10_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_log_f32_e32 v0, v0
-; GCN-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_log10_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_log_f32_e32 v0, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_log10_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_log_f16_e32 v0, v0
-; GFX8-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_log10_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_log_f16_e32 v0, v0
-; GFX9-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_log10_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_log_f16_e32 v0, v0
-; GFX10-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call bfloat @llvm.log10.bf16(bfloat %a)
-  ret bfloat %op
-}
-
-declare bfloat @llvm.exp.bf16(bfloat)
-declare bfloat @llvm.exp2.bf16(bfloat)
-declare bfloat @llvm.exp10.bf16(bfloat)
-
-define bfloat @v_exp_bf16(bfloat %a) {
-; GCN-LABEL: v_exp_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GCN-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_exp_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX7-NEXT:    v_exp_f32_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_exp_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_exp_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_exp_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call bfloat @llvm.exp.bf16(bfloat %a)
-  ret bfloat %op
-}
-
-define bfloat @v_exp2_bf16(bfloat %a) {
-; GCN-LABEL: v_exp2_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_exp2_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_exp_f32_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_exp2_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_exp_f16_e32 v0, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_exp2_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_exp_f16_e32 v0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_exp2_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_exp_f16_e32 v0, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call bfloat @llvm.exp2.bf16(bfloat %a)
-  ret bfloat %op
-}
-
-define bfloat @v_exp10_bf16(bfloat %a) {
-; GCN-LABEL: v_exp10_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GCN-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_exp10_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX7-NEXT:    v_exp_f32_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_exp10_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX8-NEXT:    v_exp_f32_e32 v0, v0
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_exp10_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_exp10_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call bfloat @llvm.exp10.bf16(bfloat %a)
-  ret bfloat %op
-}
-
-declare bfloat @llvm.ceil.bf16(bfloat)
-
-define bfloat @v_ceil_bf16(bfloat %a) {
-; GCN-LABEL: v_ceil_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_ceil_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_ceil_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_ceil_f32_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_ceil_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_ceil_f16_e32 v0, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_ceil_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_ceil_f16_e32 v0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_ceil_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_ceil_f16_e32 v0, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call bfloat @llvm.ceil.bf16(bfloat %a)
-  ret bfloat %op
-}
-
-declare bfloat @llvm.trunc.bf16(bfloat)
-
-define bfloat @v_trunc_bf16(bfloat %a) {
-; GCN-LABEL: v_trunc_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_trunc_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_trunc_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_trunc_f32_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_trunc_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_trunc_f16_e32 v0, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_trunc_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f16_e32 v0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_trunc_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_trunc_f16_e32 v0, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call bfloat @llvm.trunc.bf16(bfloat %a)
-  ret bfloat %op
-}
-
-declare bfloat @llvm.rint.bf16(bfloat)
-
-define bfloat @v_rint_bf16(bfloat %a) {
-; GCN-LABEL: v_rint_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_rndne_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_rint_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_rint_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_rndne_f16_e32 v0, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_rint_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rndne_f16_e32 v0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_rint_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_rndne_f16_e32 v0, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call bfloat @llvm.rint.bf16(bfloat %a)
-  ret bfloat %op
-}
-
-declare bfloat @llvm.nearbyint.bf16(bfloat)
-
-; FIXME: unable to legalize instruction: %2:_(s16) = G_FNEARBYINT %0:_
-; define bfloat @v_nearbyint_bf16(bfloat %a) {
-;   %op = call bfloat @llvm.nearbyint.bf16(bfloat %a)
-;   ret bfloat %op
-; }
-
-declare bfloat @llvm.round.bf16(bfloat)
-
-define bfloat @v_round_bf16(bfloat %a) {
-; GCN-LABEL: v_round_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, 0.5
-; GCN-NEXT:    v_mov_b32_e32 v3, 0x3c00
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
-; GCN-NEXT:    v_trunc_f32_e32 v4, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT:    v_cvt_f32_f16_e64 v5, -v4
-; GCN-NEXT:    v_add_f32_e32 v1, v1, v5
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
-; GCN-NEXT:    v_cmp_ge_f32_e32 vcc, v1, v2
-; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_add_f32_e32 v0, v1, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_round_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0x3c00
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
-; GFX7-NEXT:    v_trunc_f32_e32 v2, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e64 v3, -v2
-; GFX7-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, 0.5
-; GFX7-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
-; GFX7-NEXT:    v_cmp_ge_f32_e32 vcc, v1, v3
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
-; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_round_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_trunc_f16_e32 v1, v0
-; GFX8-NEXT:    v_sub_f16_e32 v2, v0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0x3c00
-; GFX8-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
-; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT:    v_add_f16_e32 v0, v1, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_round_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f16_e32 v1, v0
-; GFX9-NEXT:    v_sub_f16_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3c00
-; GFX9-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
-; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX9-NEXT:    v_add_f16_e32 v0, v1, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_round_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_trunc_f16_e32 v1, v0
-; GFX10-NEXT:    v_sub_f16_e32 v2, v0, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
-; GFX10-NEXT:    v_cmp_ge_f16_e64 s4, |v2|, 0.5
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s4
-; GFX10-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX10-NEXT:    v_add_f16_e32 v0, v1, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call bfloat @llvm.round.bf16(bfloat %a)
-  ret bfloat %op
-}
-
-declare bfloat @llvm.roundeven.bf16(bfloat)
-
-define bfloat @v_roundeven_bf16(bfloat %a) {
-; GCN-LABEL: v_roundeven_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_rndne_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_roundeven_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_roundeven_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_rndne_f16_e32 v0, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_roundeven_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rndne_f16_e32 v0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_roundeven_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_rndne_f16_e32 v0, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call bfloat @llvm.roundeven.bf16(bfloat %a)
-  ret bfloat %op
-}
-
-declare bfloat @llvm.floor.bf16(bfloat)
-
-define bfloat @v_floor_bf16(bfloat %a) {
-; GCN-LABEL: v_floor_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_floor_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_floor_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_floor_f32_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_floor_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_floor_f16_e32 v0, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_floor_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_floor_f16_e32 v0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_floor_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_floor_f16_e32 v0, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call bfloat @llvm.floor.bf16(bfloat %a)
-  ret bfloat %op
-}
-
-declare bfloat @llvm.canonicalize.bf16(bfloat)
-
-define bfloat @v_canonicalize_bf16(bfloat %a) {
-; GCN-LABEL: v_canonicalize_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_canonicalize_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_canonicalize_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_canonicalize_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_canonicalize_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call bfloat @llvm.canonicalize.bf16(bfloat %a)
-  ret bfloat %op
-}
-
-declare bfloat @llvm.arithmetic.fence.bf16(bfloat)
-
-; FIXME: Promotion broken
-; define bfloat @v_arithmetic_fence_bf16(bfloat %a) {
-;   %op = call bfloat @llvm.arithmetic.fence.bf16(bfloat %a)
-;   ret bfloat %op
-; }
-
-define i1 @v_fcmp_false_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_false_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_false_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_false_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_false_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_false_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fcmp false bfloat %a, %b
-  ret i1 %op
-}
-
-define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_oeq_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cmp_eq_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_oeq_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_oeq_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_eq_f16_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_oeq_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_oeq_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fcmp oeq bfloat %a, %b
-  ret i1 %op
-}
-
-define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_ogt_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_ogt_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_ogt_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_gt_f16_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_ogt_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_gt_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_ogt_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fcmp ogt bfloat %a, %b
-  ret i1 %op
-}
-
-define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_oge_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_oge_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_oge_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_ge_f16_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_oge_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_oge_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fcmp oge bfloat %a, %b
-  ret i1 %op
-}
-
-define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_olt_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_olt_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_olt_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_olt_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_olt_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fcmp olt bfloat %a, %b
-  ret i1 %op
-}
-
-define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_ole_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cmp_le_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_ole_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cmp_le_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_ole_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_le_f16_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_ole_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_le_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_ole_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cmp_le_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fcmp ole bfloat %a, %b
-  ret i1 %op
-}
-
-define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_one_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cmp_lg_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_one_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cmp_lg_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_one_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_lg_f16_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_one_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_lg_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_one_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fcmp one bfloat %a, %b
-  ret i1 %op
-}
-
-define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_uno_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_uno_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cmp_u_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_uno_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_u_f16_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_uno_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_u_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_uno_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cmp_u_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fcmp uno bfloat %a, %b
-  ret i1 %op
-}
-
-define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_ueq_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cmp_nlg_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_ueq_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cmp_nlg_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_ueq_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_nlg_f16_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_ueq_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_nlg_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_ueq_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fcmp ueq bfloat %a, %b
-  ret i1 %op
-}
-
-define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_ugt_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cmp_nle_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_ugt_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cmp_nle_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_ugt_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_nle_f16_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_ugt_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_nle_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_ugt_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fcmp ugt bfloat %a, %b
-  ret i1 %op
-}
-
-define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_uge_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_uge_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_uge_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_uge_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_uge_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fcmp uge bfloat %a, %b
-  ret i1 %op
-}
-
-define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_ult_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_ult_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_ult_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_nge_f16_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_ult_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_nge_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_ult_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fcmp ult bfloat %a, %b
-  ret i1 %op
-}
-
-define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_ule_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_ule_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_ule_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_ngt_f16_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_ule_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_ngt_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_ule_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fcmp ule bfloat %a, %b
-  ret i1 %op
-}
-
-define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_une_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cmp_neq_f32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_une_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cmp_neq_f32_e32 vcc, v0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_une_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cmp_neq_f16_e32 vcc, v0, v1
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_une_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_neq_f16_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_une_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fcmp une bfloat %a, %b
-  ret i1 %op
-}
-
-define i1 @v_fcmp_true_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_true_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, 1
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_true_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, 1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_true_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, 1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_true_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, 1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_true_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, 1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = fcmp true bfloat %a, %b
-  ret i1 %op
-}
-
-declare bfloat @llvm.copysign.bf16(bfloat, bfloat)
-
-define bfloat @v_copysign_bf16_bf16(bfloat %mag, bfloat %sign) {
-; GCN-LABEL: v_copysign_bf16_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_copysign_bf16_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_copysign_bf16_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_copysign_bf16_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_copysign_bf16_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
-  ret bfloat %op
-}
-
-; FIXME: unable to lower arguments: ptr
-; define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) {
-;   %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
-;   ret bfloat %op
-; }
-
-; FIXME: unable to lower arguments: ptr
-; define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) {
-;   %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
-;   ret bfloat %op
-; }
-
-; FIXME: unable to translate instruction: fptrunc
-; define bfloat @v_copysign_bf16_f32(bfloat %mag, float %sign.f32) {
-;   %sign = fptrunc float %sign.f32 to bfloat
-;   %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
-;   ret bfloat %op
-; }
-
-; FIXME: unable to translate instruction: fptrunc
-; define bfloat @v_copysign_bf16_f64(bfloat %mag, double %sign.f64) {
-;   %sign = fptrunc double %sign.f64 to bfloat
-;   %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
-;   ret bfloat %op
-; }
-
-define bfloat @v_copysign_bf16_f16(bfloat %mag, half %sign.f16) {
-; GCN-LABEL: v_copysign_bf16_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_copysign_bf16_f16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_copysign_bf16_f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_copysign_bf16_f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_copysign_bf16_f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %sign = bitcast half %sign.f16 to bfloat
-  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
-  ret bfloat %op
-}
-
-define amdgpu_ps i32 @s_copysign_bf16_bf16(bfloat inreg %mag, bfloat inreg %sign) {
-; GCN-LABEL: s_copysign_bf16_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_and_b32 s0, s0, 0x7fff
-; GCN-NEXT:    s_and_b32 s1, s1, 0xffff8000
-; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_and_b32 s0, 0xffff, s0
-; GCN-NEXT:    ; return to shader part epilog
-;
-; GFX7-LABEL: s_copysign_bf16_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_and_b32 s0, s0, 0x7fff
-; GFX7-NEXT:    s_and_b32 s1, s1, 0xffff8000
-; GFX7-NEXT:    s_or_b32 s0, s0, s1
-; GFX7-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX7-NEXT:    ; return to shader part epilog
-;
-; GFX8-LABEL: s_copysign_bf16_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_and_b32 s0, s0, 0x7fff
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff8000
-; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX8-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: s_copysign_bf16_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_and_b32 s0, s0, 0x7fff
-; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff8000
-; GFX9-NEXT:    s_or_b32 s0, s0, s1
-; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: s_copysign_bf16_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_and_b32 s0, s0, 0x7fff
-; GFX10-NEXT:    s_and_b32 s1, s1, 0xffff8000
-; GFX10-NEXT:    s_or_b32 s0, s0, s1
-; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX10-NEXT:    ; return to shader part epilog
-  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
-  %cast = bitcast bfloat %op to i16
-  %zext = zext i16 %cast to i32
-  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
-  ret i32 %readlane
-}
-
-; FIXME: unable to translate instruction: fptrunc
-; define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f32) {
-;   %sign = fptrunc float %sign.f32 to bfloat
-;   %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
-;   %cast = bitcast bfloat %op to i16
-;   %zext = zext i16 %cast to i32
-;   %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
-;   ret i32 %readlane
-; }
-
-; FIXME: unable to translate instruction: fptrunc
-; define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign.f64) {
-;   %sign = fptrunc double %sign.f64 to bfloat
-;   %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
-;   %cast = bitcast bfloat %op to i16
-;   %zext = zext i16 %cast to i32
-;   %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
-;   ret i32 %readlane
-; }
-
-define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f16) {
-; GCN-LABEL: s_copysign_bf16_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_and_b32 s0, s0, 0x7fff
-; GCN-NEXT:    s_and_b32 s1, s1, 0xffff8000
-; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_and_b32 s0, 0xffff, s0
-; GCN-NEXT:    ; return to shader part epilog
-;
-; GFX7-LABEL: s_copysign_bf16_f16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_and_b32 s0, s0, 0x7fff
-; GFX7-NEXT:    s_and_b32 s1, s1, 0xffff8000
-; GFX7-NEXT:    s_or_b32 s0, s0, s1
-; GFX7-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX7-NEXT:    ; return to shader part epilog
-;
-; GFX8-LABEL: s_copysign_bf16_f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_and_b32 s0, s0, 0x7fff
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff8000
-; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX8-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: s_copysign_bf16_f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_and_b32 s0, s0, 0x7fff
-; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff8000
-; GFX9-NEXT:    s_or_b32 s0, s0, s1
-; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: s_copysign_bf16_f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_and_b32 s0, s0, 0x7fff
-; GFX10-NEXT:    s_and_b32 s1, s1, 0xffff8000
-; GFX10-NEXT:    s_or_b32 s0, s0, s1
-; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX10-NEXT:    ; return to shader part epilog
-  %sign = bitcast half %sign.f16 to bfloat
-  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
-  %cast = bitcast bfloat %op to i16
-  %zext = zext i16 %cast to i32
-  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
-  ret i32 %readlane
-}
-
-declare float @llvm.copysign.f32(float, float)
-
-; FIXME: unable to translate instruction: fpext
-; define float @v_copysign_f32_bf16(float %mag, bfloat %sign.bf16) {
-;   %sign = fpext bfloat %sign.bf16 to float
-;   %op = call float @llvm.copysign.f32(float %mag, float %sign)
-;   ret float %op
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define amdgpu_ps i32 @s_copysign_f32_bf16(float inreg %mag, bfloat inreg %sign.bf16) {
-;   %sign = fpext bfloat %sign.bf16 to float
-;   %op = call float @llvm.copysign.f32(float %mag, float %sign)
-;   %cast = bitcast float %op to i32
-;   %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
-;   ret i32 %readlane
-; }
-
-declare half @llvm.copysign.f16(half, half)
-
-define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) {
-; GCN-LABEL: v_copysign_f16_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_copysign_f16_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_copysign_f16_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_copysign_f16_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_copysign_f16_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %sign = bitcast bfloat %sign.bf16 to half
-  %op = call half @llvm.copysign.f16(half %mag, half %sign)
-  ret half %op
-}
-
-define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf16) {
-; GCN-LABEL: s_copysign_f16_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_and_b32 s0, s0, 0x7fff
-; GCN-NEXT:    s_and_b32 s1, s1, 0xffff8000
-; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_and_b32 s0, 0xffff, s0
-; GCN-NEXT:    ; return to shader part epilog
-;
-; GFX7-LABEL: s_copysign_f16_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_and_b32 s0, s0, 0x7fff
-; GFX7-NEXT:    s_and_b32 s1, s1, 0xffff8000
-; GFX7-NEXT:    s_or_b32 s0, s0, s1
-; GFX7-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX7-NEXT:    ; return to shader part epilog
-;
-; GFX8-LABEL: s_copysign_f16_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_and_b32 s0, s0, 0x7fff
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff8000
-; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX8-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: s_copysign_f16_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_and_b32 s0, s0, 0x7fff
-; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff8000
-; GFX9-NEXT:    s_or_b32 s0, s0, s1
-; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: s_copysign_f16_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_and_b32 s0, s0, 0x7fff
-; GFX10-NEXT:    s_and_b32 s1, s1, 0xffff8000
-; GFX10-NEXT:    s_or_b32 s0, s0, s1
-; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX10-NEXT:    ; return to shader part epilog
-  %sign = bitcast bfloat %sign.bf16 to half
-  %op = call half @llvm.copysign.f16(half %mag, half %sign)
-  %cast = bitcast half %op to i16
-  %zext = zext i16 %cast to i32
-  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
-  ret i32 %readlane
-}
-
-declare double @llvm.copysign.f64(double, double)
-
-; FIXME: unable to translate instruction: fpext
-; define double @v_copysign_f64_bf16(double %mag, bfloat %sign.bf16) {
-;   %sign = fpext bfloat %sign.bf16 to double
-;   %op = call double @llvm.copysign.f64(double %mag, double %sign)
-;   ret double %op
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define amdgpu_ps <2 x i32> @s_copysign_f64_bf16(double inreg %mag, bfloat inreg %sign.bf16) {
-;   %sign = fpext bfloat %sign.bf16 to double
-;   %op = call double @llvm.copysign.f64(double %mag, double %sign)
-;   %cast = bitcast double %op to <2 x i32>
-;   %cast.0 = extractelement <2 x i32> %cast, i32 0
-;   %cast.1 = extractelement <2 x i32> %cast, i32 1
-;   %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.0)
-;   %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.1)
-;   %ins.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
-;   %ins.1 = insertelement <2 x i32> %ins.0, i32 %readlane1, i32 1
-;   ret <2 x i32> %ins.1
-; }
-
-; FIXME: unable to translate instruction: fptosi
-; define i16 @v_fptosi_bf16_to_i16(bfloat %x) {
-;   %op = fptosi bfloat %x to i16
-;   ret i16 %op
-; }
-
-; FIXME: unable to translate instruction: fptosi
-; define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) {
-;   %op = fptosi <2 x bfloat> %x to <2 x i16>
-;   ret <2 x i16> %op
-; }
-
-; FIXME: unable to translate instruction: fptosi
-; define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) {
-;   %op = fptosi <3 x bfloat> %x to <3 x i16>
-;   ret <3 x i16> %op
-; }
-
-; FIXME: unable to translate instruction: fptosi
-; define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
-;   %op = fptosi <4 x bfloat> %x to <4 x i16>
-;   ret <4 x i16> %op
-; }
-
-; FIXME: unable to translate instruction: fptosi
-; define i32 @v_fptosi_bf16_to_i32(bfloat %x) {
-;   %op = fptosi bfloat %x to i32
-;   ret i32 %op
-; }
-
-; FIXME: unable to translate instruction: fptosi
-; define <2 x i32> @v_fptosi_v2bf16_to_v2i32(<2 x bfloat> %x) {
-;   %op = fptosi <2 x bfloat> %x to <2 x i32>
-;   ret <2 x i32> %op
-; }
-
-; FIXME: unable to translate instruction: fptosi
-; define <3 x i32> @v_fptosi_v3bf16_to_v3i32(<3 x bfloat> %x) {
-;   %op = fptosi <3 x bfloat> %x to <3 x i32>
-;   ret <3 x i32> %op
-; }
-
-; FIXME: unable to translate instruction: fptosi
-; define <4 x i32> @v_fptosi_v4bf16_to_v4i32(<4 x bfloat> %x) {
-;   %op = fptosi <4 x bfloat> %x to <4 x i32>
-;   ret <4 x i32> %op
-; }
-
-; FIXME: unable to translate instruction: fptosi
-; define i64 @v_fptosi_bf16_to_i64(bfloat %x) {
-;   %op = fptosi bfloat %x to i64
-;   ret i64 %op
-; }
-
-; FIXME: unable to translate instruction: fptosi
-; define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) {
-;   %op = fptosi <2 x bfloat> %x to <2 x i64>
-;   ret <2 x i64> %op
-; }
-
-; FIXME: unable to translate instruction: fptosi
-; define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) {
-;   %op = fptosi <3 x bfloat> %x to <3 x i64>
-;   ret <3 x i64> %op
-; }
-
-; FIXME: unable to translate instruction: fptosi
-; define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) {
-;   %op = fptosi <4 x bfloat> %x to <4 x i64>
-;   ret <4 x i64> %op
-; }
-
-; FIXME: unable to translate instruction: sitofp
-; define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
-;   %op = sitofp i16 %x to bfloat
-;   ret bfloat %op
-; }
-
-; FIXME: unable to translate instruction: sitofp
-; define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
-;   %op = sitofp <2 x i16> %x to <2 x bfloat>
-;   ret <2 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: sitofp
-; define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
-;   %op = sitofp <3 x i16> %x to <3 x bfloat>
-;   ret <3 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: sitofp
-; define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
-;   %op = sitofp <4 x i16> %x to <4 x bfloat>
-;   ret <4 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: sitofp
-; define bfloat @v_sitofp_i32_to_bf16(i32 %x) {
-;   %op = sitofp i32 %x to bfloat
-;   ret bfloat %op
-; }
-
-; FIXME: unable to translate instruction: sitofp
-; define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
-;   %op = sitofp <2 x i32> %x to <2 x bfloat>
-;   ret <2 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: sitofp
-; define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
-;   %op = sitofp <3 x i32> %x to <3 x bfloat>
-;   ret <3 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: sitofp
-; define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
-;   %op = sitofp <4 x i32> %x to <4 x bfloat>
-;   ret <4 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: sitofp
-; define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
-;   %op = sitofp i64 %x to bfloat
-;   ret bfloat %op
-; }
-
-; FIXME: unable to translate instruction: sitofp
-; define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
-;   %op = sitofp <2 x i64> %x to <2 x bfloat>
-;   ret <2 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: sitofp
-; define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
-;   %op = sitofp <3 x i64> %x to <3 x bfloat>
-;   ret <3 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: sitofp
-; define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
-;   %op = sitofp <4 x i64> %x to <4 x bfloat>
-;   ret <4 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: uitofp
-; define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
-;   %op = uitofp i16 %x to bfloat
-;   ret bfloat %op
-; }
-
-; FIXME: unable to translate instruction: uitofp
-; define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
-;   %op = uitofp <2 x i16> %x to <2 x bfloat>
-;   ret <2 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: uitofp
-; define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
-;   %op = uitofp <3 x i16> %x to <3 x bfloat>
-;   ret <3 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: uitofp
-; define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
-;   %op = uitofp <4 x i16> %x to <4 x bfloat>
-;   ret <4 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: uitofp
-; define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
-;   %op = uitofp i32 %x to bfloat
-;   ret bfloat %op
-; }
-
-; FIXME: unable to translate instruction: uitofp
-; define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
-;   %op = uitofp <2 x i32> %x to <2 x bfloat>
-;   ret <2 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: uitofp
-; define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
-;   %op = uitofp <3 x i32> %x to <3 x bfloat>
-;   ret <3 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: uitofp
-; define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
-;   %op = uitofp <4 x i32> %x to <4 x bfloat>
-;   ret <4 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: uitofp
-; define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
-;   %op = uitofp i64 %x to bfloat
-;   ret bfloat %op
-; }
-
-; FIXME: unable to translate instruction: uitofp
-; define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
-;   %op = uitofp <2 x i64> %x to <2 x bfloat>
-;   ret <2 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: uitofp
-; define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
-;   %op = uitofp <3 x i64> %x to <3 x bfloat>
-;   ret <3 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: uitofp
-; define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
-;   %op = uitofp <4 x i64> %x to <4 x bfloat>
-;   ret <4 x bfloat> %op
-; }
-
-define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
-; GCN-LABEL: v_select_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_select_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_select_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_select_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_select_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = select i1 %cond, bfloat %a, bfloat %b
-  ret bfloat %op
-}
-
-define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
-; GCN-LABEL: v_select_fneg_lhs_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_select_fneg_lhs_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
-; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_select_fneg_lhs_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_select_fneg_lhs_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_select_fneg_lhs_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %neg.a = fneg bfloat %a
-  %op = select i1 %cond, bfloat %neg.a, bfloat %b
-  ret bfloat %op
-}
-
-define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
-; GCN-LABEL: v_select_fneg_rhs_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_select_fneg_rhs_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
-; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_select_fneg_rhs_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_select_fneg_rhs_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_select_fneg_rhs_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %neg.b = fneg bfloat %b
-  %op = select i1 %cond, bfloat %a, bfloat %neg.b
-  ret bfloat %op
-}
-
-define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) {
-; GCN-LABEL: v_select_v2bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v2, v1
-; GCN-NEXT:    v_or_b32_e32 v2, v4, v3
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_select_v2bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
-; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_select_v2bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_select_v2bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_select_v2bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b
-  ret <2 x bfloat> %op
-}
-
-define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b) {
-; GCN-LABEL: v_vselect_v2bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_vselect_v2bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_vselect_v2bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_vselect_v2bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_vselect_v2bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc_lo
-; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b
-  ret <2 x bfloat> %op
-}
-
-define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) {
-; GCN-LABEL: s_select_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_mov_b32_e32 v1, s0
-; GCN-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NEXT:    ; return to shader part epilog
-;
-; GFX7-LABEL: s_select_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX7-NEXT:    ; return to shader part epilog
-;
-; GFX8-LABEL: s_select_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_mov_b32_e32 v1, s0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX8-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: s_select_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: s_select_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s0, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    ; return to shader part epilog
-  %cond = icmp eq i32 %c, 0
-  %op = select i1 %cond, bfloat %a, bfloat %b
-  %cast = bitcast bfloat %op to i16
-  %zext = zext i16 %cast to i32
-  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
-  ret i32 %readlane
-}
-
-; FIXME: unable to translate instruction: bitcast
-; define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, i32 %c) {
-;   %cond = icmp eq i32 %c, 0
-;   %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b
-;   %cast = bitcast <2 x bfloat> %op to i32
-;   %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
-;   ret i32 %readlane
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x i32> %c) {
-;   %cond = icmp eq <2 x i32> %c, zeroinitializer
-;   %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b
-;   %cast = bitcast <2 x bfloat> %op to i32
-;   %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
-;   ret i32 %readlane
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b) {
-;   %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b
-;   ret <3 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) {
-;   %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b
-;   ret <4 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b) {
-;   %op = select i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b
-;   ret <6 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) {
-;   %op = select i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b
-;   ret <8 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b) {
-;   %op = select i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b
-;   ret <16 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b) {
-;   %op = select i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b
-;   ret <32 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> inreg %b, i32 %c) {
-;   %cond = icmp eq i32 %c, 0
-;   %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b
-;   %cast = bitcast <3 x bfloat> %op to i48
-;   %elt0 = trunc i48 %cast to i32
-;   %elt1.hi = lshr i48 %cast, 32
-;   %elt1 = trunc i48 %elt1.hi to i32
-;   %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
-;   %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
-;   %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
-;   %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
-;   ret <2 x i32> %bv.1
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, i32 %c) {
-;   %cond = icmp eq i32 %c, 0
-;   %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b
-;   %cast = bitcast <4 x bfloat> %op to <2 x i32>
-;   %elt0 = extractelement <2 x i32> %cast, i32 0
-;   %elt1 = extractelement <2 x i32> %cast, i32 1
-;   %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
-;   %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
-;   %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
-;   %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
-;   ret <2 x i32> %bv.1
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, <4 x i32> %c) {
-;   %cond = icmp eq <4 x i32> %c, zeroinitializer
-;   %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b
-;   %cast = bitcast <4 x bfloat> %op to <2 x i32>
-;   %elt0 = extractelement <2 x i32> %cast, i32 0
-;   %elt1 = extractelement <2 x i32> %cast, i32 1
-;   %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
-;   %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
-;   %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
-;   %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
-;   ret <2 x i32> %bv.1
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b) {
-;   %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b
-;   ret <4 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b) {
-;   %op = select <8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b
-;   ret <8 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b) {
-;   %op = select <16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b
-;   ret <16 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b) {
-;   %op = select <32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b
-;   ret <32 x bfloat> %op
-; }
-
-declare bfloat @llvm.fma.bf16(bfloat, bfloat, bfloat)
-declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
-declare <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)
-declare <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
-
-define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
-; GCN-LABEL: v_fma_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_fma_f32 v0, v0, v1, v2
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fma_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_fma_f32 v0, v0, v1, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fma_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_fma_f16 v0, v0, v1, v2
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fma_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_fma_f16 v0, v0, v1, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fma_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_fma_f16 v0, v0, v1, v2
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
-  ret bfloat %op
-}
-
-define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
-; GCN-LABEL: v_fma_v2bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    v_fma_f32 v0, v0, v2, v4
-; GCN-NEXT:    v_fma_f32 v1, v1, v3, v5
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fma_v2bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_fma_f32 v0, v0, v2, v4
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_fma_f32 v1, v1, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fma_v2bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX8-NEXT:    v_fma_f16 v0, v0, v1, v2
-; GFX8-NEXT:    v_fma_f16 v1, v3, v4, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fma_v2bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fma_v2bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
-  ret <2 x bfloat> %op
-}
-
-define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) {
-; GCN-LABEL: v_fma_v3bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT:    v_fma_f32 v0, v0, v3, v6
-; GCN-NEXT:    v_fma_f32 v1, v1, v4, v7
-; GCN-NEXT:    v_fma_f32 v2, v2, v5, v8
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fma_v3bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_fma_f32 v0, v0, v3, v6
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v7
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v8
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_fma_f32 v1, v1, v3, v4
-; GFX7-NEXT:    v_fma_f32 v2, v2, v5, v6
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fma_v3bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX8-NEXT:    v_fma_f16 v0, v0, v2, v4
-; GFX8-NEXT:    v_fma_f16 v1, v1, v3, v5
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fma_v3bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
-; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v0
-; GFX9-NEXT:    v_bfi_b32 v1, s4, v2, v2
-; GFX9-NEXT:    v_bfi_b32 v2, s4, v4, v4
-; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fma_v3bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
-; GFX10-NEXT:    v_bfi_b32 v1, 0xffff, v2, v2
-; GFX10-NEXT:    v_bfi_b32 v2, 0xffff, v4, v4
-; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
-  ret <3 x bfloat> %op
-}
-
-define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
-; GCN-LABEL: v_fma_v4bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT:    v_fma_f32 v0, v0, v4, v8
-; GCN-NEXT:    v_fma_f32 v1, v1, v5, v9
-; GCN-NEXT:    v_fma_f32 v2, v2, v6, v10
-; GCN-NEXT:    v_fma_f32 v3, v3, v7, v11
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fma_v4bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT:    v_fma_f32 v0, v0, v4, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v6
-; GFX7-NEXT:    v_fma_f32 v1, v1, v5, v9
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v10
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v7
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v11
-; GFX7-NEXT:    v_fma_f32 v2, v2, v4, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_fma_f32 v3, v3, v6, v7
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fma_v4bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX8-NEXT:    v_fma_f16 v0, v0, v2, v4
-; GFX8-NEXT:    v_fma_f16 v1, v1, v3, v5
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fma_v4bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fma_v4bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
-  ret <4 x bfloat> %op
-}
-
-declare bfloat @llvm.fmuladd.bf16(bfloat, bfloat, bfloat)
-declare <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
-declare <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)
-declare <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
-
-define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
-; GCN-LABEL: v_fmuladd_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v2
-; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fmuladd_bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fmuladd_bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX8-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fmuladd_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX9-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fmuladd_bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX10-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call bfloat @llvm.fmuladd.bf16(bfloat %a, bfloat %b, bfloat %c)
-  ret bfloat %op
-}
-
-define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
-; GCN-LABEL: v_fmuladd_v2bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GCN-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_add_f32_e32 v0, v0, v4
-; GCN-NEXT:    v_add_f32_e32 v1, v1, v5
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fmuladd_v2bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fmuladd_v2bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mul_f16_e32 v3, v0, v1
-; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_add_f16_e32 v1, v3, v2
-; GFX8-NEXT:    v_add_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fmuladd_v2bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
-; GFX9-NEXT:    v_pk_add_f16 v0, v0, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fmuladd_v2bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v1
-; GFX10-NEXT:    v_pk_add_f16 v0, v0, v2
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
-  ret <2 x bfloat> %op
-}
-
-define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) {
-; GCN-LABEL: v_fmuladd_v3bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT:    v_mul_f32_e32 v0, v0, v3
-; GCN-NEXT:    v_mul_f32_e32 v1, v1, v4
-; GCN-NEXT:    v_mul_f32_e32 v2, v2, v5
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_add_f32_e32 v0, v0, v6
-; GCN-NEXT:    v_add_f32_e32 v1, v1, v7
-; GCN-NEXT:    v_add_f32_e32 v2, v2, v8
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fmuladd_v3bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v6
-; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v7
-; GFX7-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v8
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_add_f32_e32 v2, v2, v4
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fmuladd_v3bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mul_f16_e32 v1, v0, v2
-; GFX8-NEXT:    v_mul_f16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_add_f16_e32 v0, v1, v4
-; GFX8-NEXT:    v_add_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fmuladd_v3bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
-; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v0
-; GFX9-NEXT:    v_bfi_b32 v1, s4, v2, v2
-; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
-; GFX9-NEXT:    v_bfi_b32 v1, s4, v4, v4
-; GFX9-NEXT:    v_pk_add_f16 v0, v0, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fmuladd_v3bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
-; GFX10-NEXT:    v_bfi_b32 v1, 0xffff, v2, v2
-; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v1
-; GFX10-NEXT:    v_bfi_b32 v1, 0xffff, v4, v4
-; GFX10-NEXT:    v_pk_add_f16 v0, v0, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
-  ret <3 x bfloat> %op
-}
-
-define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
-; GCN-LABEL: v_fmuladd_v4bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT:    v_mul_f32_e32 v0, v0, v4
-; GCN-NEXT:    v_mul_f32_e32 v1, v1, v5
-; GCN-NEXT:    v_mul_f32_e32 v2, v2, v6
-; GCN-NEXT:    v_mul_f32_e32 v3, v3, v7
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_add_f32_e32 v0, v0, v8
-; GCN-NEXT:    v_add_f32_e32 v1, v1, v9
-; GCN-NEXT:    v_add_f32_e32 v2, v2, v10
-; GCN-NEXT:    v_add_f32_e32 v3, v3, v11
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fmuladd_v4bf16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v6
-; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v7
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v4
-; GFX7-NEXT:    v_mul_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v8
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v9
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v10
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v11
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_add_f32_e32 v2, v2, v4
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fmuladd_v4bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mul_f16_e32 v1, v0, v2
-; GFX8-NEXT:    v_mul_f16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT:    v_add_f16_e32 v0, v1, v4
-; GFX8-NEXT:    v_add_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fmuladd_v4bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX9-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
-; GFX9-NEXT:    v_mov_b32_sdwa v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT:    v_pk_add_f16 v0, v0, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fmuladd_v4bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
-; GFX10-NEXT:    v_mov_b32_sdwa v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT:    v_pk_add_f16 v0, v0, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-  %op = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
-  ret <4 x bfloat> %op
-}

>From e0ff9aaf19fbab39da390de298d857be07f090ed Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 9 Jan 2024 14:22:10 +0100
Subject: [PATCH 5/6] Comments

---
 llvm/lib/CodeGen/GlobalISel/CallLowering.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 5b6dc0e5e20c40..5f4c59f6c55887 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -485,8 +485,7 @@ static void buildCopyFromRegs(MachineIRBuilder &B, ArrayRef<Register> OrigRegs,
     if (NumElts == Regs.size())
       BuildVec = B.buildBuildVector(BVType, Regs).getReg(0);
     else {
-      SmallVector<Register, 0> BVRegs;
-      BVRegs.reserve(NumElts);
+      SmallVector<Register, 0> BVRegs(NumElts);
 
       // Vector elements are packed in the inputs.
       // e.g. we have a <4 x s16> but 2 x s32 in regs.
@@ -507,8 +506,9 @@ static void buildCopyFromRegs(MachineIRBuilder &B, ArrayRef<Register> OrigRegs,
           BVRegs.push_back(B.buildAnyExt(PartLLT, Unmerge.getReg(K)).getReg(0));
       }
 
-      // We may have some more elements in BVRegs, e.g. if we have 2 s32 pieces for a <3 x s16> vector. We should have less than EltPerReg extra items.
-      if(BVRegs.size() > NumElts) {
+      // We may have some more elements in BVRegs, e.g. if we have 2 s32 pieces
+      // for a <3 x s16> vector. We should have less than EltPerReg extra items.
+      if (BVRegs.size() > NumElts) {
         assert((BVRegs.size() - NumElts) < EltPerReg);
         BVRegs.truncate(NumElts);
       }

>From 85a9835586542ba9e8af514b3c7cf315c3cd09ba Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 16 Jan 2024 10:02:53 +0100
Subject: [PATCH 6/6] Use reserve()

---
 llvm/lib/CodeGen/GlobalISel/CallLowering.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 5f4c59f6c55887..ccd9b13d730b60 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -485,8 +485,6 @@ static void buildCopyFromRegs(MachineIRBuilder &B, ArrayRef<Register> OrigRegs,
     if (NumElts == Regs.size())
       BuildVec = B.buildBuildVector(BVType, Regs).getReg(0);
     else {
-      SmallVector<Register, 0> BVRegs(NumElts);
-
       // Vector elements are packed in the inputs.
       // e.g. we have a <4 x s16> but 2 x s32 in regs.
       assert(NumElts > Regs.size());
@@ -500,6 +498,8 @@ static void buildCopyFromRegs(MachineIRBuilder &B, ArrayRef<Register> OrigRegs,
       unsigned EltPerReg =
           (SrcEltTy.getSizeInBits() / OriginalEltTy.getSizeInBits());
 
+      SmallVector<Register, 0> BVRegs;
+      BVRegs.reserve(Regs.size() * EltPerReg);
       for (Register R : Regs) {
         auto Unmerge = B.buildUnmerge(OriginalEltTy, R);
         for (unsigned K = 0; K < EltPerReg; ++K)



More information about the llvm-commits mailing list