[llvm] [GlobalISel] Fix buildCopyFromRegs for split vectors (PR #77448)
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 9 05:08:36 PST 2024
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/77448
>From 6409bf5c6d454568a79d149e383ba6f857f47fb2 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 9 Jan 2024 12:47:47 +0100
Subject: [PATCH 1/4] [GlobalISel] Fix buildCopyFromRegs for split vectors
Fixes #77055
---
llvm/lib/CodeGen/GlobalISel/CallLowering.cpp | 34 ++++++-
llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll | 96 ++++++++++++++++++++
2 files changed, 127 insertions(+), 3 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 6858e030c2c75e..1e3c5d5d8007b1 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -478,9 +478,37 @@ static void buildCopyFromRegs(MachineIRBuilder &B, ArrayRef<Register> OrigRegs,
} else {
// Vector was split, and elements promoted to a wider type.
// FIXME: Should handle floating point promotions.
- LLT BVType = LLT::fixed_vector(LLTy.getNumElements(), PartLLT);
- auto BV = B.buildBuildVector(BVType, Regs);
- B.buildTrunc(OrigRegs[0], BV);
+ unsigned NumElts = LLTy.getNumElements();
+ LLT BVType = LLT::fixed_vector(NumElts, PartLLT);
+
+ Register BuildVec;
+ if (NumElts == Regs.size())
+ BuildVec = B.buildBuildVector(BVType, Regs).getReg(0);
+ else {
+ SmallVector<Register, 0> BVRegs;
+ BVRegs.reserve(NumElts);
+
+ // Vector elements are packed in the inputs.
+ // e.g. we have a <4 x s16> but 2 x s32 in regs.
+ assert(NumElts > Regs.size());
+ LLT SrcEltTy = MRI.getType(Regs[0]);
+ LLT OriginalEltTy = MRI.getType(OrigRegs[0]).getElementType();
+
+ // Input registers contain packed elements.
+ // Determine how many elements per reg.
+ assert((SrcEltTy.getSizeInBits() % OriginalEltTy.getSizeInBits()) == 0);
+ unsigned EltPerReg =
+ (SrcEltTy.getSizeInBits() / OriginalEltTy.getSizeInBits());
+
+ for (Register R : Regs) {
+ auto Unmerge = B.buildUnmerge(OriginalEltTy, R);
+ for (unsigned K = 0; K < EltPerReg; ++K)
+ BVRegs.push_back(B.buildAnyExt(PartLLT, Unmerge.getReg(K)).getReg(0));
+ }
+ assert(BVRegs.size() == NumElts);
+ BuildVec = B.buildBuildVector(BVType, BVRegs).getReg(0);
+ }
+ B.buildTrunc(OrigRegs[0], BuildVec);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll
new file mode 100644
index 00000000000000..3037b84b25775a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -global-isel -mtriple=amdgcn | FileCheck %s -check-prefixes=GCN
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefixes=GFX7
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefix=GFX11
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefix=GFX11
+
+; TODO: expand testcases - currently only contains cases that were known to crash.
+
+; assert in IRTranslator, #77055
+define <4 x bfloat> @v4bf16(<4 x bfloat> %arg0) {
+; GCN-LABEL: v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_or_b32_e32 v3, v1, v0
+; GCN-NEXT: v_or_b32_e32 v2, v4, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_or_b32_e32 v4, v1, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX10-NEXT: v_mov_b32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v4bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v3, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = shufflevector <4 x bfloat> %arg0, <4 x bfloat> zeroinitializer, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
+ ret <4 x bfloat> %res
+}
>From 3c2c6b852ebad918395b88323e6e78a2ff6c32ed Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 9 Jan 2024 13:52:49 +0100
Subject: [PATCH 2/4] Fix v3bf16 cases + improve testing
---
llvm/lib/CodeGen/GlobalISel/CallLowering.cpp | 8 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll | 13814 ++++++++++++++++-
2 files changed, 13761 insertions(+), 61 deletions(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 1e3c5d5d8007b1..782d0760b94dea 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -492,6 +492,7 @@ static void buildCopyFromRegs(MachineIRBuilder &B, ArrayRef<Register> OrigRegs,
// e.g. we have a <4 x s16> but 2 x s32 in regs.
assert(NumElts > Regs.size());
LLT SrcEltTy = MRI.getType(Regs[0]);
+
LLT OriginalEltTy = MRI.getType(OrigRegs[0]).getElementType();
// Input registers contain packed elements.
@@ -505,7 +506,12 @@ static void buildCopyFromRegs(MachineIRBuilder &B, ArrayRef<Register> OrigRegs,
for (unsigned K = 0; K < EltPerReg; ++K)
BVRegs.push_back(B.buildAnyExt(PartLLT, Unmerge.getReg(K)).getReg(0));
}
- assert(BVRegs.size() == NumElts);
+
+ // We may have some more elements in BVRegs, e.g. if we have 2 s32 pieces for a <3 x s16> vector. We should have fewer than EltPerReg extra items.
+ if(BVRegs.size() > NumElts) {
+ assert((BVRegs.size() - NumElts) < EltPerReg);
+ BVRegs.truncate(NumElts);
+ }
BuildVec = B.buildBuildVector(BVType, BVRegs).getReg(0);
}
B.buildTrunc(OrigRegs[0], BuildVec);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll
index 3037b84b25775a..aaefb634b132aa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll
@@ -4,93 +4,13787 @@
; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8
; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9
; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefix=GFX11
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefix=GFX11
-; TODO: expand testcases - currently only contains cases that were known to crash.
+; FIXME: GFX11 cannot select some truncs: %0:vgpr_32(s16) = G_TRUNC %1:vgpr_32(s32)
+; llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefix=GFX11
+; llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefix=GFX11
-; assert in IRTranslator, #77055
-define <4 x bfloat> @v4bf16(<4 x bfloat> %arg0) {
-; GCN-LABEL: v4bf16:
+define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_load_store:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_load_store:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_load_store:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: flat_store_short v[2:3], v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_load_store:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_short v[2:3], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_load_store:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_short v[2:3], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %val = load bfloat, ptr addrspace(1) %in
+ store bfloat %val, ptr addrspace(1) %out
+ ret void
+}
+
+define <2 x bfloat> @v_load_global_v2bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v2bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v2bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v2bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v2bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v2bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %load = load <2 x bfloat>, ptr addrspace(1) %ptr
+ ret <2 x bfloat> %load
+}
+
+define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v3bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_mov_b32_e32 v2, v3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v3bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v2, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v3bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v3bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v3bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %load = load <3 x bfloat>, ptr addrspace(1) %ptr
+ ret <3 x bfloat> %load
+}
+
+define <4 x bfloat> @v_load_global_v4bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GCN-NEXT: v_mov_b32_e32 v0, v4
+; GCN-NEXT: v_mov_b32_e32 v2, v5
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX7-NEXT: v_mov_b32_e32 v2, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, v4
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %load = load <4 x bfloat>, ptr addrspace(1) %ptr
+ ret <4 x bfloat> %load
+}
+
+define <6 x bfloat> @v_load_global_v6bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v6bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v8
+; GCN-NEXT: v_mov_b32_e32 v0, v6
+; GCN-NEXT: v_mov_b32_e32 v2, v7
+; GCN-NEXT: v_mov_b32_e32 v4, v8
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v6bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_load_dwordx3 v[6:8], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v8
+; GFX7-NEXT: v_mov_b32_e32 v0, v6
+; GFX7-NEXT: v_mov_b32_e32 v2, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v8
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v6bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx3 v[2:4], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, v2
+; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v6bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx3 v[2:4], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v6bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx3 v[2:4], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %load = load <6 x bfloat>, ptr addrspace(1) %ptr
+ ret <6 x bfloat> %load
+}
+
+define <8 x bfloat> @v_load_global_v8bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v8bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v11
+; GCN-NEXT: v_mov_b32_e32 v0, v8
+; GCN-NEXT: v_mov_b32_e32 v2, v9
+; GCN-NEXT: v_mov_b32_e32 v4, v10
+; GCN-NEXT: v_mov_b32_e32 v6, v11
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v8bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v11
+; GFX7-NEXT: v_mov_b32_e32 v0, v8
+; GFX7-NEXT: v_mov_b32_e32 v2, v9
+; GFX7-NEXT: v_mov_b32_e32 v4, v10
+; GFX7-NEXT: v_mov_b32_e32 v6, v11
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v8bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v8bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v8bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %load = load <8 x bfloat>, ptr addrspace(1) %ptr
+ ret <8 x bfloat> %load
+}
+
+define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v16bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: buffer_load_dwordx4 v[23:26], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_dwordx4 v[19:22], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v26
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v22
+; GCN-NEXT: v_mov_b32_e32 v0, v23
+; GCN-NEXT: v_mov_b32_e32 v2, v24
+; GCN-NEXT: v_mov_b32_e32 v4, v25
+; GCN-NEXT: v_mov_b32_e32 v6, v26
+; GCN-NEXT: v_mov_b32_e32 v8, v19
+; GCN-NEXT: v_mov_b32_e32 v10, v20
+; GCN-NEXT: v_mov_b32_e32 v12, v21
+; GCN-NEXT: v_mov_b32_e32 v14, v22
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v16bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_load_dwordx4 v[22:25], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dwordx4 v[18:21], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v22
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v23
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v24
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v25
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v18
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v19
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v20
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v21
+; GFX7-NEXT: v_mov_b32_e32 v0, v22
+; GFX7-NEXT: v_mov_b32_e32 v2, v23
+; GFX7-NEXT: v_mov_b32_e32 v4, v24
+; GFX7-NEXT: v_mov_b32_e32 v6, v25
+; GFX7-NEXT: v_mov_b32_e32 v8, v18
+; GFX7-NEXT: v_mov_b32_e32 v10, v19
+; GFX7-NEXT: v_mov_b32_e32 v12, v20
+; GFX7-NEXT: v_mov_b32_e32 v14, v21
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v16bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v11
+; GFX8-NEXT: v_mov_b32_e32 v0, v8
+; GFX8-NEXT: v_mov_b32_e32 v2, v9
+; GFX8-NEXT: v_mov_b32_e32 v4, v10
+; GFX8-NEXT: v_mov_b32_e32 v6, v11
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v16bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v11
+; GFX9-NEXT: v_mov_b32_e32 v0, v8
+; GFX9-NEXT: v_mov_b32_e32 v2, v9
+; GFX9-NEXT: v_mov_b32_e32 v4, v10
+; GFX9-NEXT: v_mov_b32_e32 v6, v11
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v16bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v11
+; GFX10-NEXT: v_mov_b32_e32 v0, v8
+; GFX10-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-NEXT: v_mov_b32_e32 v4, v10
+; GFX10-NEXT: v_mov_b32_e32 v6, v11
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %load = load <16 x bfloat>, ptr addrspace(1) %ptr
+ ret <16 x bfloat> %load
+}
+
+define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v32bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: buffer_load_dwordx4 v[34:37], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[39:42], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_load_dwordx4 v[48:51], v[0:1], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v34
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v35
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v36
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v37
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v39
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v40
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v41
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v42
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v48
+; GCN-NEXT: buffer_load_dwordx4 v[52:55], v[0:1], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v49
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v50
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v51
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v52
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v53
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v54
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v55
+; GCN-NEXT: v_mov_b32_e32 v0, v34
+; GCN-NEXT: v_mov_b32_e32 v2, v35
+; GCN-NEXT: v_mov_b32_e32 v4, v36
+; GCN-NEXT: v_mov_b32_e32 v6, v37
+; GCN-NEXT: v_mov_b32_e32 v8, v39
+; GCN-NEXT: v_mov_b32_e32 v10, v40
+; GCN-NEXT: v_mov_b32_e32 v12, v41
+; GCN-NEXT: v_mov_b32_e32 v14, v42
+; GCN-NEXT: v_mov_b32_e32 v16, v48
+; GCN-NEXT: v_mov_b32_e32 v18, v49
+; GCN-NEXT: v_mov_b32_e32 v20, v50
+; GCN-NEXT: v_mov_b32_e32 v22, v51
+; GCN-NEXT: v_mov_b32_e32 v24, v52
+; GCN-NEXT: v_mov_b32_e32 v26, v53
+; GCN-NEXT: v_mov_b32_e32 v28, v54
+; GCN-NEXT: v_mov_b32_e32 v30, v55
+; GCN-NEXT: v_mov_b32_e32 v1, v38
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v32bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_load_dwordx4 v[38:41], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dwordx4 v[48:51], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: buffer_load_dwordx4 v[34:37], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT: buffer_load_dwordx4 v[52:55], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v40
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v41
+; GFX7-NEXT: v_mov_b32_e32 v4, v40
+; GFX7-NEXT: v_mov_b32_e32 v6, v41
+; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v38
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v39
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v48
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v49
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v50
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v51
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v34
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v35
+; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v36
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v37
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v52
+; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v53
+; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v54
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v55
+; GFX7-NEXT: v_mov_b32_e32 v0, v38
+; GFX7-NEXT: v_mov_b32_e32 v2, v39
+; GFX7-NEXT: v_mov_b32_e32 v8, v48
+; GFX7-NEXT: v_mov_b32_e32 v10, v49
+; GFX7-NEXT: v_mov_b32_e32 v12, v50
+; GFX7-NEXT: v_mov_b32_e32 v14, v51
+; GFX7-NEXT: v_mov_b32_e32 v16, v34
+; GFX7-NEXT: v_mov_b32_e32 v18, v35
+; GFX7-NEXT: v_mov_b32_e32 v20, v36
+; GFX7-NEXT: v_mov_b32_e32 v22, v37
+; GFX7-NEXT: v_mov_b32_e32 v24, v52
+; GFX7-NEXT: v_mov_b32_e32 v26, v53
+; GFX7-NEXT: v_mov_b32_e32 v28, v54
+; GFX7-NEXT: v_mov_b32_e32 v30, v55
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v32bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx4 v[22:25], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dwordx4 v[18:21], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v22
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v23
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v24
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v25
+; GFX8-NEXT: v_mov_b32_e32 v0, v22
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v18
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v19
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v20
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v21
+; GFX8-NEXT: v_mov_b32_e32 v2, v23
+; GFX8-NEXT: v_mov_b32_e32 v4, v24
+; GFX8-NEXT: v_mov_b32_e32 v6, v25
+; GFX8-NEXT: v_mov_b32_e32 v8, v18
+; GFX8-NEXT: v_mov_b32_e32 v10, v19
+; GFX8-NEXT: v_mov_b32_e32 v12, v20
+; GFX8-NEXT: v_mov_b32_e32 v14, v21
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v32bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx4 v[22:25], v[0:1], off
+; GFX9-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v22
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v24
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v25
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v21
+; GFX9-NEXT: v_mov_b32_e32 v0, v22
+; GFX9-NEXT: v_mov_b32_e32 v2, v23
+; GFX9-NEXT: v_mov_b32_e32 v4, v24
+; GFX9-NEXT: v_mov_b32_e32 v6, v25
+; GFX9-NEXT: v_mov_b32_e32 v8, v18
+; GFX9-NEXT: v_mov_b32_e32 v10, v19
+; GFX9-NEXT: v_mov_b32_e32 v12, v20
+; GFX9-NEXT: v_mov_b32_e32 v14, v21
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v32bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx4 v[22:25], v[0:1], off
+; GFX10-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v22
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v23
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v24
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v25
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v21
+; GFX10-NEXT: v_mov_b32_e32 v0, v22
+; GFX10-NEXT: v_mov_b32_e32 v2, v23
+; GFX10-NEXT: v_mov_b32_e32 v4, v24
+; GFX10-NEXT: v_mov_b32_e32 v6, v25
+; GFX10-NEXT: v_mov_b32_e32 v8, v18
+; GFX10-NEXT: v_mov_b32_e32 v10, v19
+; GFX10-NEXT: v_mov_b32_e32 v12, v20
+; GFX10-NEXT: v_mov_b32_e32 v14, v21
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %load = load <32 x bfloat>, ptr addrspace(1) %ptr
+ ret <32 x bfloat> %load
+}
+
+define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v64bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: buffer_load_dwordx4 v[21:24], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_dwordx4 v[25:28], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_load_dwordx4 v[29:32], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_load_dwordx4 v[17:20], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_load_dwordx4 v[13:16], v[1:2], s[4:7], 0 addr64 offset:64
+; GCN-NEXT: buffer_load_dwordx4 v[9:12], v[1:2], s[4:7], 0 addr64 offset:80
+; GCN-NEXT: buffer_load_dwordx4 v[5:8], v[1:2], s[4:7], 0 addr64 offset:96
+; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[1:2], s[4:7], 0 addr64 offset:112
+; GCN-NEXT: s_waitcnt vmcnt(7)
+; GCN-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v21, vcc, 4, v0
+; GCN-NEXT: buffer_store_dword v22, v21, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v21, vcc, 8, v0
+; GCN-NEXT: buffer_store_dword v23, v21, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v21, vcc, 12, v0
+; GCN-NEXT: buffer_store_dword v24, v21, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v21, vcc, 16, v0
+; GCN-NEXT: s_waitcnt expcnt(2)
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 20, v0
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 24, v0
+; GCN-NEXT: s_waitcnt vmcnt(10)
+; GCN-NEXT: buffer_store_dword v25, v21, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v21, vcc, 28, v0
+; GCN-NEXT: buffer_store_dword v26, v22, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 32, v0
+; GCN-NEXT: buffer_store_dword v27, v23, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 36, v0
+; GCN-NEXT: buffer_store_dword v28, v21, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v21, vcc, 40, v0
+; GCN-NEXT: s_waitcnt expcnt(4)
+; GCN-NEXT: v_add_i32_e32 v24, vcc, 44, v0
+; GCN-NEXT: s_waitcnt expcnt(3)
+; GCN-NEXT: v_add_i32_e32 v25, vcc, 48, v0
+; GCN-NEXT: s_waitcnt expcnt(2)
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 52, v0
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 56, v0
+; GCN-NEXT: s_waitcnt vmcnt(13)
+; GCN-NEXT: buffer_store_dword v29, v22, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v0
+; GCN-NEXT: buffer_store_dword v30, v23, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0
+; GCN-NEXT: buffer_store_dword v31, v21, s[0:3], 0 offen
+; GCN-NEXT: v_mov_b32_e32 v21, 0x44
+; GCN-NEXT: buffer_store_dword v32, v24, s[0:3], 0 offen
+; GCN-NEXT: v_mov_b32_e32 v24, 0x48
+; GCN-NEXT: s_waitcnt expcnt(4)
+; GCN-NEXT: v_mov_b32_e32 v28, 0x4c
+; GCN-NEXT: s_waitcnt expcnt(3)
+; GCN-NEXT: v_mov_b32_e32 v29, 0x50
+; GCN-NEXT: s_waitcnt expcnt(2)
+; GCN-NEXT: v_mov_b32_e32 v30, 0x54
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_mov_b32_e32 v31, 0x58
+; GCN-NEXT: s_waitcnt vmcnt(14)
+; GCN-NEXT: buffer_store_dword v17, v25, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v17, 0x5c
+; GCN-NEXT: buffer_store_dword v18, v26, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v18, 0x60
+; GCN-NEXT: buffer_store_dword v19, v27, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v19, 0x64
+; GCN-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v20, 0x68
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 0x6c, v0
+; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x70, v0
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x74, v0
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x78, v0
+; GCN-NEXT: v_add_i32_e32 v21, vcc, v0, v21
+; GCN-NEXT: v_add_i32_e32 v24, vcc, v0, v24
+; GCN-NEXT: v_add_i32_e32 v28, vcc, v0, v28
+; GCN-NEXT: v_add_i32_e32 v29, vcc, v0, v29
+; GCN-NEXT: v_add_i32_e32 v30, vcc, v0, v30
+; GCN-NEXT: v_add_i32_e32 v31, vcc, v0, v31
+; GCN-NEXT: v_add_i32_e32 v17, vcc, v0, v17
+; GCN-NEXT: v_add_i32_e32 v18, vcc, v0, v18
+; GCN-NEXT: v_add_i32_e32 v19, vcc, v0, v19
+; GCN-NEXT: v_add_i32_e32 v20, vcc, v0, v20
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
+; GCN-NEXT: buffer_store_dword v13, v23, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v14, v21, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v15, v24, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v16, v28, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v9, v29, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v10, v30, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v11, v31, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v12, v17, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(14)
+; GCN-NEXT: buffer_store_dword v5, v18, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v7, v20, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v8, v22, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v3, v27, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v64bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_load_dwordx4 v[21:24], v[1:2], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dwordx4 v[25:28], v[1:2], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: buffer_load_dwordx4 v[29:32], v[1:2], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT: buffer_load_dwordx4 v[13:16], v[1:2], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT: buffer_load_dwordx4 v[17:20], v[1:2], s[4:7], 0 addr64 offset:64
+; GFX7-NEXT: buffer_load_dwordx4 v[9:12], v[1:2], s[4:7], 0 addr64 offset:80
+; GFX7-NEXT: buffer_load_dwordx4 v[5:8], v[1:2], s[4:7], 0 addr64 offset:96
+; GFX7-NEXT: buffer_load_dwordx4 v[1:4], v[1:2], s[4:7], 0 addr64 offset:112
+; GFX7-NEXT: s_waitcnt vmcnt(7)
+; GFX7-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v21, vcc, 4, v0
+; GFX7-NEXT: buffer_store_dword v22, v21, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v21, vcc, 8, v0
+; GFX7-NEXT: buffer_store_dword v23, v21, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v21, vcc, 12, v0
+; GFX7-NEXT: v_add_i32_e32 v23, vcc, 16, v0
+; GFX7-NEXT: buffer_store_dword v24, v21, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(10)
+; GFX7-NEXT: buffer_store_dword v25, v23, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v23, vcc, 20, v0
+; GFX7-NEXT: buffer_store_dword v26, v23, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v23, vcc, 24, v0
+; GFX7-NEXT: buffer_store_dword v27, v23, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v23, vcc, 28, v0
+; GFX7-NEXT: v_add_i32_e32 v26, vcc, 32, v0
+; GFX7-NEXT: buffer_store_dword v28, v23, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v27, vcc, 36, v0
+; GFX7-NEXT: s_waitcnt vmcnt(13)
+; GFX7-NEXT: buffer_store_dword v29, v26, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v26, vcc, 40, v0
+; GFX7-NEXT: v_mov_b32_e32 v21, 0x44
+; GFX7-NEXT: v_mov_b32_e32 v22, 0x48
+; GFX7-NEXT: v_mov_b32_e32 v23, 0x4c
+; GFX7-NEXT: v_mov_b32_e32 v24, 0x50
+; GFX7-NEXT: v_mov_b32_e32 v25, 0x54
+; GFX7-NEXT: buffer_store_dword v30, v27, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v27, vcc, 44, v0
+; GFX7-NEXT: buffer_store_dword v31, v26, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v26, vcc, 48, v0
+; GFX7-NEXT: buffer_store_dword v32, v27, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v27, vcc, 52, v0
+; GFX7-NEXT: v_add_i32_e32 v28, vcc, 56, v0
+; GFX7-NEXT: v_add_i32_e32 v29, vcc, 60, v0
+; GFX7-NEXT: v_add_i32_e32 v30, vcc, 64, v0
+; GFX7-NEXT: v_add_i32_e32 v21, vcc, v0, v21
+; GFX7-NEXT: v_add_i32_e32 v22, vcc, v0, v22
+; GFX7-NEXT: v_add_i32_e32 v23, vcc, v0, v23
+; GFX7-NEXT: v_add_i32_e32 v24, vcc, v0, v24
+; GFX7-NEXT: v_add_i32_e32 v25, vcc, v0, v25
+; GFX7-NEXT: s_waitcnt vmcnt(14)
+; GFX7-NEXT: buffer_store_dword v13, v26, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v14, v27, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v15, v28, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v16, v29, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v17, v30, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v19, v22, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v9, v24, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v10, v25, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v9, 0x58
+; GFX7-NEXT: v_add_i32_e32 v9, vcc, v0, v9
+; GFX7-NEXT: buffer_store_dword v11, v9, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v9, 0x5c
+; GFX7-NEXT: v_add_i32_e32 v9, vcc, v0, v9
+; GFX7-NEXT: buffer_store_dword v12, v9, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v9, 0x60
+; GFX7-NEXT: v_add_i32_e32 v9, vcc, v0, v9
+; GFX7-NEXT: s_waitcnt vmcnt(14)
+; GFX7-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v5, 0x64
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, v0, v5
+; GFX7-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v5, 0x68
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, v0, v5
+; GFX7-NEXT: buffer_store_dword v7, v5, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x6c, v0
+; GFX7-NEXT: buffer_store_dword v8, v5, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0
+; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x74, v0
+; GFX7-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
+; GFX7-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v64bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8-NEXT: flat_load_dwordx4 v[38:41], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dwordx4 v[48:51], v[2:3]
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dwordx4 v[34:37], v[2:3]
+; GFX8-NEXT: flat_load_dwordx4 v[52:55], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(3)
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v40
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v41
+; GFX8-NEXT: v_mov_b32_e32 v4, v40
+; GFX8-NEXT: v_mov_b32_e32 v6, v41
+; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v38
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v39
+; GFX8-NEXT: s_waitcnt vmcnt(4)
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v48
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v49
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v50
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v51
+; GFX8-NEXT: v_mov_b32_e32 v0, v38
+; GFX8-NEXT: v_mov_b32_e32 v2, v39
+; GFX8-NEXT: v_mov_b32_e32 v8, v48
+; GFX8-NEXT: s_waitcnt vmcnt(3)
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v34
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v35
+; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v36
+; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v37
+; GFX8-NEXT: s_waitcnt vmcnt(2)
+; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v52
+; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v53
+; GFX8-NEXT: v_lshrrev_b32_e32 v29, 16, v54
+; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v55
+; GFX8-NEXT: v_mov_b32_e32 v10, v49
+; GFX8-NEXT: v_mov_b32_e32 v12, v50
+; GFX8-NEXT: v_mov_b32_e32 v14, v51
+; GFX8-NEXT: v_mov_b32_e32 v16, v34
+; GFX8-NEXT: v_mov_b32_e32 v18, v35
+; GFX8-NEXT: v_mov_b32_e32 v20, v36
+; GFX8-NEXT: v_mov_b32_e32 v22, v37
+; GFX8-NEXT: v_mov_b32_e32 v24, v52
+; GFX8-NEXT: v_mov_b32_e32 v26, v53
+; GFX8-NEXT: v_mov_b32_e32 v28, v54
+; GFX8-NEXT: v_mov_b32_e32 v30, v55
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v64bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: global_load_dwordx4 v[38:41], v[0:1], off
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:16
+; GFX9-NEXT: global_load_dwordx4 v[34:37], v[0:1], off offset:32
+; GFX9-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:48
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v40
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v41
+; GFX9-NEXT: v_mov_b32_e32 v4, v40
+; GFX9-NEXT: v_mov_b32_e32 v6, v41
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v38
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v39
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v48
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v49
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v50
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v51
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v34
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v35
+; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v36
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v37
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v52
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v53
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v54
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v55
+; GFX9-NEXT: v_mov_b32_e32 v0, v38
+; GFX9-NEXT: v_mov_b32_e32 v2, v39
+; GFX9-NEXT: v_mov_b32_e32 v8, v48
+; GFX9-NEXT: v_mov_b32_e32 v10, v49
+; GFX9-NEXT: v_mov_b32_e32 v12, v50
+; GFX9-NEXT: v_mov_b32_e32 v14, v51
+; GFX9-NEXT: v_mov_b32_e32 v16, v34
+; GFX9-NEXT: v_mov_b32_e32 v18, v35
+; GFX9-NEXT: v_mov_b32_e32 v20, v36
+; GFX9-NEXT: v_mov_b32_e32 v22, v37
+; GFX9-NEXT: v_mov_b32_e32 v24, v52
+; GFX9-NEXT: v_mov_b32_e32 v26, v53
+; GFX9-NEXT: v_mov_b32_e32 v28, v54
+; GFX9-NEXT: v_mov_b32_e32 v30, v55
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v64bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x3
+; GFX10-NEXT: global_load_dwordx4 v[64:67], v[0:1], off
+; GFX10-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:16
+; GFX10-NEXT: global_load_dwordx4 v[34:37], v[0:1], off offset:32
+; GFX10-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:48
+; GFX10-NEXT: s_waitcnt vmcnt(3)
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v64
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v65
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v66
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v67
+; GFX10-NEXT: s_waitcnt vmcnt(2)
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v48
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v49
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v50
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v51
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v34
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v35
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v36
+; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v37
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v52
+; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v53
+; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v54
+; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v55
+; GFX10-NEXT: v_mov_b32_e32 v0, v64
+; GFX10-NEXT: v_mov_b32_e32 v2, v65
+; GFX10-NEXT: v_mov_b32_e32 v4, v66
+; GFX10-NEXT: v_mov_b32_e32 v6, v67
+; GFX10-NEXT: v_mov_b32_e32 v8, v48
+; GFX10-NEXT: v_mov_b32_e32 v10, v49
+; GFX10-NEXT: v_mov_b32_e32 v12, v50
+; GFX10-NEXT: v_mov_b32_e32 v14, v51
+; GFX10-NEXT: v_mov_b32_e32 v16, v34
+; GFX10-NEXT: v_mov_b32_e32 v18, v35
+; GFX10-NEXT: v_mov_b32_e32 v20, v36
+; GFX10-NEXT: v_mov_b32_e32 v22, v37
+; GFX10-NEXT: v_mov_b32_e32 v24, v52
+; GFX10-NEXT: v_mov_b32_e32 v26, v53
+; GFX10-NEXT: v_mov_b32_e32 v28, v54
+; GFX10-NEXT: v_mov_b32_e32 v30, v55
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %load = load <64 x bfloat>, ptr addrspace(1) %ptr
+ ret <64 x bfloat> %load
+}
+
+define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v2bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v2bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v2bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_store_dword v[1:2], v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v2bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v[1:2], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v2bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_store_dword v[1:2], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ store <2 x bfloat> %val, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v3bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: buffer_store_short v0, v[3:4], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:2
+; GCN-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v3bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_store_short v0, v[3:4], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:2
+; GFX7-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v3bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 2, v2
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_store_short v[2:3], v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_store_short v[4:5], v6
+; GFX8-NEXT: flat_store_short v[2:3], v1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v3bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_store_short v[2:3], v0, off
+; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off offset:2
+; GFX9-NEXT: global_store_short v[2:3], v1, off offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v3bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_store_short v[2:3], v0, off
+; GFX10-NEXT: global_store_short_d16_hi v[2:3], v0, off offset:2
+; GFX10-NEXT: global_store_short v[2:3], v1, off offset:4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ store <3 x bfloat> %val, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT: v_or_b32_e32 v3, v1, v0
-; GCN-NEXT: v_or_b32_e32 v2, v4, v2
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: v_or_b32_e32 v1, v3, v2
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX8-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ store <4 x bfloat> %val, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_store_global_v8bf16(<8 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v8bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: v_or_b32_e32 v1, v3, v2
+; GCN-NEXT: v_or_b32_e32 v2, v5, v4
+; GCN-NEXT: v_or_b32_e32 v3, v7, v6
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v8bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v4
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v8bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX8-NEXT: v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v8bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX9-NEXT: v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v8bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX10-NEXT: v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ store <8 x bfloat> %val, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v16bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: v_or_b32_e32 v1, v3, v2
+; GCN-NEXT: v_or_b32_e32 v2, v5, v4
+; GCN-NEXT: v_or_b32_e32 v3, v7, v6
+; GCN-NEXT: v_or_b32_e32 v4, v9, v8
+; GCN-NEXT: v_or_b32_e32 v5, v11, v10
+; GCN-NEXT: v_or_b32_e32 v6, v13, v12
+; GCN-NEXT: v_or_b32_e32 v7, v15, v14
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v16bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v4
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v9
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v8
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v11
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v10
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v13
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v12
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v15
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v14
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX7-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v16bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; GFX8-NEXT: v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v7
+; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GFX8-NEXT: v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v8
+; GFX8-NEXT: v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v16bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v7
+; GFX9-NEXT: v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
+; GFX9-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v16bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v7
+; GFX10-NEXT: v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
+; GFX10-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ store <16 x bfloat> %val, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v32bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: v_or_b32_e32 v1, v3, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v4
+; GCN-NEXT: v_or_b32_e32 v2, v2, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GCN-NEXT: v_or_b32_e32 v3, v3, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v9
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v11
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v10
+; GCN-NEXT: v_or_b32_e32 v4, v4, v5
+; GCN-NEXT: v_or_b32_e32 v5, v6, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v13
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v12
+; GCN-NEXT: v_or_b32_e32 v6, v6, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v15
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v14
+; GCN-NEXT: v_or_b32_e32 v7, v7, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v17
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v19
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v18
+; GCN-NEXT: v_or_b32_e32 v8, v8, v9
+; GCN-NEXT: v_or_b32_e32 v9, v10, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v21
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v20
+; GCN-NEXT: v_or_b32_e32 v10, v10, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v23
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v25
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v27
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v26
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v29
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v28
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v30
+; GCN-NEXT: v_or_b32_e32 v11, v11, v12
+; GCN-NEXT: v_or_b32_e32 v12, v13, v14
+; GCN-NEXT: v_or_b32_e32 v13, v15, v16
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32
+; GCN-NEXT: v_or_b32_e32 v14, v17, v18
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: v_or_b32_e32 v15, v15, v19
+; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[12:15], v[16:17], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v32bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_or_b32_e32 v1, v3, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v4
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v9
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v11
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v10
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT: v_or_b32_e32 v5, v6, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v13
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v12
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v15
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v14
+; GFX7-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v17
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v16
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v19
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v18
+; GFX7-NEXT: v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT: v_or_b32_e32 v9, v10, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v21
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v20
+; GFX7-NEXT: v_or_b32_e32 v10, v10, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v23
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v22
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v25
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v24
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v27
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff, v26
+; GFX7-NEXT: v_or_b32_e32 v11, v11, v12
+; GFX7-NEXT: v_or_b32_e32 v12, v13, v14
+; GFX7-NEXT: v_or_b32_e32 v13, v15, v16
+; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v29
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff, v28
+; GFX7-NEXT: v_or_b32_e32 v14, v14, v16
+; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
+; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff, v30
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_or_b32_e32 v15, v15, v18
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT: buffer_store_dwordx4 v[12:15], v[16:17], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v32bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v3
+; GFX8-NEXT: v_mov_b32_sdwa v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v1, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v2, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v3, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v7
+; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
+; GFX8-NEXT: v_mov_b32_sdwa v4, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v16
+; GFX8-NEXT: v_mov_b32_sdwa v5, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v7, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v26, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v11
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v16
+; GFX8-NEXT: v_mov_b32_sdwa v8, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v9, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v11, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v14
+; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v15
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v16
+; GFX8-NEXT: v_mov_b32_sdwa v12, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v13, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v14, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v15, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v32bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v9
+; GFX9-NEXT: v_mov_b32_sdwa v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10
+; GFX9-NEXT: v_mov_b32_sdwa v1, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v11
+; GFX9-NEXT: v_mov_b32_sdwa v2, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v12
+; GFX9-NEXT: v_mov_b32_sdwa v3, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v13
+; GFX9-NEXT: v_mov_b32_sdwa v4, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v14
+; GFX9-NEXT: v_mov_b32_sdwa v5, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v15
+; GFX9-NEXT: v_mov_b32_sdwa v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v7, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v8, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v9, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v11, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v12, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v13, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v14, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v15, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: global_store_dwordx4 v[16:17], v[0:3], off
+; GFX9-NEXT: global_store_dwordx4 v[16:17], v[4:7], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:48
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v32bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v13
+; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; GFX10-NEXT: v_mov_b32_sdwa v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v1, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v2, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v3, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v4, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v5, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v7, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v8, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v9, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v10, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v11, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v12, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v13, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v14, v32 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v15, v33 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: global_store_dwordx4 v[16:17], v[0:3], off
+; GFX10-NEXT: global_store_dwordx4 v[16:17], v[4:7], off offset:16
+; GFX10-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:32
+; GFX10-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:48
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ store <32 x bfloat> %val, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v64bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: v_or_b32_e32 v1, v3, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v4
+; GCN-NEXT: v_or_b32_e32 v2, v2, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GCN-NEXT: v_or_b32_e32 v3, v3, v4
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v9
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v11
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v10
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-NEXT: v_or_b32_e32 v1, v2, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v13
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v12
+; GCN-NEXT: v_or_b32_e32 v2, v2, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v15
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v14
+; GCN-NEXT: v_or_b32_e32 v3, v3, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v17
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v19
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v21
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v23
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v25
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v27
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v26
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:4
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v0, v6, v7
+; GCN-NEXT: v_or_b32_e32 v1, v8, v9
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GCN-NEXT: v_or_b32_e32 v2, v10, v11
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20
+; GCN-NEXT: v_or_b32_e32 v3, v12, v13
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:28
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v29
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v28
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:32
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v0, v14, v15
+; GCN-NEXT: v_or_b32_e32 v1, v16, v17
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:40
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:44
+; GCN-NEXT: v_or_b32_e32 v2, v12, v13
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v30
+; GCN-NEXT: s_waitcnt vmcnt(14)
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v18
+; GCN-NEXT: v_or_b32_e32 v3, v16, v3
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68
+; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v8
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v7
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-NEXT: v_or_b32_e32 v1, v2, v3
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76
+; GCN-NEXT: s_waitcnt vmcnt(14)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v10
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v9
+; GCN-NEXT: v_or_b32_e32 v2, v2, v3
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84
+; GCN-NEXT: s_waitcnt vmcnt(14)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v20
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v11
+; GCN-NEXT: s_waitcnt vmcnt(12)
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v14
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v21
+; GCN-NEXT: s_waitcnt vmcnt(10)
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v12
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v15
+; GCN-NEXT: v_or_b32_e32 v3, v3, v6
+; GCN-NEXT: v_or_b32_e32 v6, v7, v8
+; GCN-NEXT: v_or_b32_e32 v7, v10, v11
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:88
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92
+; GCN-NEXT: s_waitcnt vmcnt(10)
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v16
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v13
+; GCN-NEXT: v_or_b32_e32 v8, v8, v10
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100
+; GCN-NEXT: s_waitcnt vmcnt(9)
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v18
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v17
+; GCN-NEXT: s_waitcnt vmcnt(7)
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v19
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v22
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v9
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v23
+; GCN-NEXT: v_or_b32_e32 v9, v10, v11
+; GCN-NEXT: v_or_b32_e32 v10, v16, v17
+; GCN-NEXT: v_or_b32_e32 v11, v18, v19
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v24
+; GCN-NEXT: v_or_b32_e32 v12, v12, v18
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GCN-NEXT: v_or_b32_e32 v13, v13, v14
+; GCN-NEXT: v_or_b32_e32 v14, v16, v15
+; GCN-NEXT: v_or_b32_e32 v15, v18, v17
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:128
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v19
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GCN-NEXT: v_or_b32_e32 v16, v18, v16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GCN-NEXT: v_or_b32_e32 v17, v17, v18
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:64
+; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[4:7], 0 addr64 offset:80
+; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[4:5], s[4:7], 0 addr64 offset:96
+; GCN-NEXT: buffer_store_dwordx4 v[14:17], v[4:5], s[4:7], 0 addr64 offset:112
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v64bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_or_b32_e32 v35, v1, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v4
+; GFX7-NEXT: v_or_b32_e32 v37, v0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v6
+; GFX7-NEXT: v_or_b32_e32 v38, v0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v9
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT: v_or_b32_e32 v31, v0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v13
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v12
+; GFX7-NEXT: v_or_b32_e32 v36, v3, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v11
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v10
+; GFX7-NEXT: v_or_b32_e32 v33, v0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v15
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v14
+; GFX7-NEXT: v_or_b32_e32 v32, v2, v3
+; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32
+; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:4
+; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:8
+; GFX7-NEXT: v_or_b32_e32 v34, v0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v17
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v16
+; GFX7-NEXT: v_or_b32_e32 v4, v0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v19
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v18
+; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:16
+; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12
+; GFX7-NEXT: v_or_b32_e32 v5, v0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v21
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v20
+; GFX7-NEXT: v_or_b32_e32 v6, v0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v23
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v22
+; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24
+; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:20
+; GFX7-NEXT: v_or_b32_e32 v7, v0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v25
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v24
+; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132
+; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v27
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v26
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v29
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v28
+; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:28
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v15
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v30
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: s_waitcnt vmcnt(9)
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: s_waitcnt vmcnt(8)
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX7-NEXT: s_waitcnt vmcnt(7)
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_or_b32_e32 v8, v9, v8
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v15
+; GFX7-NEXT: s_waitcnt vmcnt(6)
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v14
+; GFX7-NEXT: s_waitcnt vmcnt(5)
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX7-NEXT: v_or_b32_e32 v9, v9, v13
+; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:32
+; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:36
+; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:40
+; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48
+; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:44
+; GFX7-NEXT: s_waitcnt vmcnt(9)
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: s_waitcnt vmcnt(8)
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX7-NEXT: v_or_b32_e32 v10, v12, v10
+; GFX7-NEXT: s_waitcnt vmcnt(6)
+; GFX7-NEXT: buffer_store_dwordx4 v[35:38], v[24:25], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52
+; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56
+; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60
+; GFX7-NEXT: buffer_store_dwordx4 v[31:34], v[24:25], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68
+; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72
+; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76
+; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80
+; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88
+; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:84
+; GFX7-NEXT: s_waitcnt vmcnt(14)
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13
+; GFX7-NEXT: v_or_b32_e32 v11, v12, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v15
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v14
+; GFX7-NEXT: v_or_b32_e32 v12, v12, v13
+; GFX7-NEXT: s_waitcnt vmcnt(13)
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v16
+; GFX7-NEXT: s_waitcnt vmcnt(12)
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v17
+; GFX7-NEXT: v_or_b32_e32 v13, v13, v14
+; GFX7-NEXT: s_waitcnt vmcnt(10)
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v18
+; GFX7-NEXT: s_waitcnt vmcnt(9)
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v19
+; GFX7-NEXT: v_or_b32_e32 v14, v14, v15
+; GFX7-NEXT: s_waitcnt vmcnt(6)
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v21
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff, v20
+; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92
+; GFX7-NEXT: v_or_b32_e32 v15, v15, v16
+; GFX7-NEXT: s_waitcnt vmcnt(5)
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v23
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff, v22
+; GFX7-NEXT: v_or_b32_e32 v16, v16, v17
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v27
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff, v26
+; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96
+; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104
+; GFX7-NEXT: v_or_b32_e32 v17, v17, v18
+; GFX7-NEXT: s_waitcnt vmcnt(5)
+; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v28
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff, v29
+; GFX7-NEXT: v_or_b32_e32 v18, v18, v23
+; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112
+; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108
+; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120
+; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116
+; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:124
+; GFX7-NEXT: s_waitcnt vmcnt(8)
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX7-NEXT: s_waitcnt vmcnt(7)
+; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX7-NEXT: v_or_b32_e32 v19, v20, v19
+; GFX7-NEXT: s_waitcnt vmcnt(5)
+; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v22
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff, v26
+; GFX7-NEXT: v_or_b32_e32 v20, v20, v21
+; GFX7-NEXT: v_or_b32_e32 v21, v22, v23
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v27
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff, v28
+; GFX7-NEXT: v_or_b32_e32 v22, v22, v23
+; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff, v29
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_or_b32_e32 v23, v23, v26
+; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[24:25], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT: buffer_store_dwordx4 v[0:3], v[24:25], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT: buffer_store_dwordx4 v[8:11], v[24:25], s[4:7], 0 addr64 offset:64
+; GFX7-NEXT: buffer_store_dwordx4 v[12:15], v[24:25], s[4:7], 0 addr64 offset:80
+; GFX7-NEXT: buffer_store_dwordx4 v[16:19], v[24:25], s[4:7], 0 addr64 offset:96
+; GFX7-NEXT: buffer_store_dwordx4 v[20:23], v[24:25], s[4:7], 0 addr64 offset:112
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v64bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX8-NEXT: v_mov_b32_sdwa v4, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v5
+; GFX8-NEXT: v_mov_b32_sdwa v5, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v6
+; GFX8-NEXT: v_mov_b32_sdwa v6, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v7
+; GFX8-NEXT: v_mov_b32_sdwa v7, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX8-NEXT: s_waitcnt vmcnt(2)
+; GFX8-NEXT: v_add_u32_e32 v34, vcc, 16, v32
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_addc_u32_e32 v35, vcc, 0, v33, vcc
+; GFX8-NEXT: flat_store_dwordx4 v[34:35], v[4:7]
+; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; GFX8-NEXT: v_mov_b32_sdwa v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v10
+; GFX8-NEXT: v_mov_b32_sdwa v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v11
+; GFX8-NEXT: v_mov_b32_sdwa v10, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v12
+; GFX8-NEXT: v_mov_b32_sdwa v11, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v13
+; GFX8-NEXT: v_mov_b32_sdwa v12, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v14
+; GFX8-NEXT: v_mov_b32_sdwa v13, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v15
+; GFX8-NEXT: v_mov_b32_sdwa v14, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX8-NEXT: v_mov_b32_sdwa v15, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX8-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX8-NEXT: v_mov_b32_sdwa v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v32
+; GFX8-NEXT: v_mov_b32_sdwa v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v33, vcc
+; GFX8-NEXT: flat_store_dwordx4 v[32:33], v[0:3]
+; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v32
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v33, vcc
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[8:11]
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v16
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v17
+; GFX8-NEXT: v_mov_b32_sdwa v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v18
+; GFX8-NEXT: v_mov_b32_sdwa v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v19
+; GFX8-NEXT: v_mov_b32_sdwa v18, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v20
+; GFX8-NEXT: v_mov_b32_sdwa v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v21
+; GFX8-NEXT: v_mov_b32_sdwa v20, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v22
+; GFX8-NEXT: v_mov_b32_sdwa v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v23
+; GFX8-NEXT: v_mov_b32_sdwa v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v24
+; GFX8-NEXT: v_mov_b32_sdwa v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v25
+; GFX8-NEXT: v_mov_b32_sdwa v24, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v26
+; GFX8-NEXT: v_mov_b32_sdwa v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v27
+; GFX8-NEXT: v_mov_b32_sdwa v26, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v28
+; GFX8-NEXT: v_mov_b32_sdwa v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v29
+; GFX8-NEXT: v_mov_b32_sdwa v28, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v30
+; GFX8-NEXT: v_mov_b32_sdwa v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: s_waitcnt vmcnt(4)
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v31
+; GFX8-NEXT: v_mov_b32_sdwa v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v32
+; GFX8-NEXT: v_mov_b32_sdwa v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v33, vcc
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
+; GFX8-NEXT: v_mov_b32_e32 v0, 0x50
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v32, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v33, vcc
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x60, v32
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v33, vcc
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x70, v32
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v33, vcc
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[28:31]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v64bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v0
+; GFX9-NEXT: v_mov_b32_sdwa v0, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1
+; GFX9-NEXT: v_mov_b32_sdwa v1, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v2
+; GFX9-NEXT: v_mov_b32_sdwa v2, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v3
+; GFX9-NEXT: v_mov_b32_sdwa v3, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: global_store_dwordx4 v[32:33], v[0:3], off
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GFX9-NEXT: v_mov_b32_sdwa v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX9-NEXT: v_mov_b32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX9-NEXT: v_mov_b32_sdwa v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v8
+; GFX9-NEXT: v_mov_b32_sdwa v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v9
+; GFX9-NEXT: v_mov_b32_sdwa v8, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; GFX9-NEXT: v_mov_b32_sdwa v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v11
+; GFX9-NEXT: v_mov_b32_sdwa v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12
+; GFX9-NEXT: v_mov_b32_sdwa v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v13
+; GFX9-NEXT: v_mov_b32_sdwa v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v14
+; GFX9-NEXT: v_mov_b32_sdwa v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v15
+; GFX9-NEXT: v_mov_b32_sdwa v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v16
+; GFX9-NEXT: v_mov_b32_sdwa v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v17
+; GFX9-NEXT: v_mov_b32_sdwa v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v18
+; GFX9-NEXT: v_mov_b32_sdwa v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v19
+; GFX9-NEXT: v_mov_b32_sdwa v18, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v20
+; GFX9-NEXT: v_mov_b32_sdwa v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v21
+; GFX9-NEXT: v_mov_b32_sdwa v20, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v22
+; GFX9-NEXT: v_mov_b32_sdwa v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v23
+; GFX9-NEXT: v_mov_b32_sdwa v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24
+; GFX9-NEXT: v_mov_b32_sdwa v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v25
+; GFX9-NEXT: v_mov_b32_sdwa v24, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26
+; GFX9-NEXT: v_mov_b32_sdwa v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v27
+; GFX9-NEXT: v_mov_b32_sdwa v26, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28
+; GFX9-NEXT: v_mov_b32_sdwa v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v29
+; GFX9-NEXT: v_mov_b32_sdwa v28, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v30
+; GFX9-NEXT: v_mov_b32_sdwa v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v31
+; GFX9-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48
+; GFX9-NEXT: v_mov_b32_sdwa v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64
+; GFX9-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80
+; GFX9-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96
+; GFX9-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v64bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x2
+; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v11
+; GFX10-NEXT: v_mov_b32_sdwa v0, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v55, 16, v13
+; GFX10-NEXT: v_lshrrev_b32_e32 v64, 16, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v65, 16, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v66, 16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v67, 16, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v68, 16, v18
+; GFX10-NEXT: v_mov_b32_sdwa v1, v35 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX10-NEXT: v_mov_b32_sdwa v2, v36 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v21
+; GFX10-NEXT: v_mov_b32_sdwa v3, v37 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX10-NEXT: v_mov_b32_sdwa v4, v38 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v23
+; GFX10-NEXT: v_mov_b32_sdwa v5, v39 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v24
+; GFX10-NEXT: v_mov_b32_sdwa v6, v48 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v25
+; GFX10-NEXT: v_mov_b32_sdwa v7, v49 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v26
+; GFX10-NEXT: v_mov_b32_sdwa v8, v50 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v27
+; GFX10-NEXT: v_mov_b32_sdwa v9, v51 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v28
+; GFX10-NEXT: v_mov_b32_sdwa v10, v52 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v29
+; GFX10-NEXT: v_mov_b32_sdwa v11, v53 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v30
+; GFX10-NEXT: v_mov_b32_sdwa v19, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v12, v54 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v13, v55 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v14, v64 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v15, v65 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v16, v66 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v17, v67 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v18, v68 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v20, v35 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v21, v36 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v22, v37 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v23, v38 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v24, v39 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v25, v48 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v26, v49 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v27, v50 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v28, v51 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v29, v52 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v30, v53 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dwordx4 v[32:33], v[0:3], off
+; GFX10-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16
+; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v31
+; GFX10-NEXT: v_mov_b32_sdwa v31, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32
+; GFX10-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48
+; GFX10-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64
+; GFX10-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80
+; GFX10-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96
+; GFX10-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ store <64 x bfloat> %val, ptr addrspace(1) %ptr
+ ret void
+}
+
+; Stores the bfloat constants 1.0 and 42.0 to two global (addrspace 1)
+; pointers. The expected code materializes them as the immediates 0x3f80 and
+; 0x4228 and writes each one with a single 16-bit (short) store.
+define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) {
+; GCN-LABEL: test_store_fpimm:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v4, 0x3f80
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: v_mov_b32_e32 v5, 0x4228
+; GCN-NEXT: buffer_store_short v4, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_short v5, v[2:3], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_store_fpimm:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v4, 0x3f80
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_store_short v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_mov_b32_e32 v0, 0x4228
+; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_store_fpimm:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x3f80
+; GFX8-NEXT: flat_store_short v[0:1], v4
+; GFX8-NEXT: v_mov_b32_e32 v0, 0x4228
+; GFX8-NEXT: flat_store_short v[2:3], v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_store_fpimm:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x3f80
+; GFX9-NEXT: global_store_short v[0:1], v4, off
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x4228
+; GFX9-NEXT: global_store_short v[2:3], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_store_fpimm:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v4, 0x3f80
+; GFX10-NEXT: v_mov_b32_e32 v5, 0x4228
+; GFX10-NEXT: global_store_short v[0:1], v4, off
+; GFX10-NEXT: global_store_short v[2:3], v5, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ store bfloat 1.0, ptr addrspace(1) %ptr0
+ store bfloat 42.0, ptr addrspace(1) %ptr1
+ ret void
+}
+
+; FIXME: unable to translate instruction: fptrunc
+; define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; %val = load float, ptr addrspace(1) %in
+; %val.bf16 = fptrunc float %val to bfloat
+; store bfloat %val.bf16, ptr addrspace(1) %out
+; ret void
+; }
+
+; FIXME: unable to translate instruction: fptrunc
+; define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; %val = load double, ptr addrspace(1) %in
+; %val.bf16 = fptrunc double %val to bfloat
+; store bfloat %val.bf16, ptr addrspace(1) %out
+; ret void
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; %val = load bfloat, ptr addrspace(1) %in
+; %val.f32 = fpext bfloat %val to float
+; store float %val.f32, ptr addrspace(1) %out
+; ret void
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; %val = load bfloat, ptr addrspace(1) %in
+; %val.f64 = fpext bfloat %val to double
+; store double %val.f64, ptr addrspace(1) %out
+; ret void
+; }
+
+; Copies a <2 x bfloat> from the global pointer %in to the global pointer
+; %out. The expected code is one dword load followed by one dword store, with
+; no per-element unpacking or repacking in between.
+define void @test_load_store_v2bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_load_store_v2bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_load_store_v2bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_load_store_v2bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: flat_store_dword v[2:3], v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_load_store_v2bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_load_store_v2bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dword v[2:3], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %val = load <2 x bfloat>, ptr addrspace(1) %in
+ store <2 x bfloat> %val, ptr addrspace(1) %out
+ ret void
+}
+
+; Copies a <4 x bfloat> from the global pointer %in to the global pointer
+; %out. The expected code is one two-dword (dwordx2) load followed by one
+; two-dword store, with no per-element unpacking or repacking in between.
+define void @test_load_store_v4bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_load_store_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_load_store_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_load_store_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_load_store_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_load_store_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %val = load <4 x bfloat>, ptr addrspace(1) %in
+ store <4 x bfloat> %val, ptr addrspace(1) %out
+ ret void
+}
+
+; Copies an <8 x bfloat> from the global pointer %in to the global pointer
+; %out. The expected code is one four-dword (dwordx4) load followed by one
+; four-dword store, with no per-element unpacking or repacking in between.
+define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_load_store_v8bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_load_store_v8bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_load_store_v8bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_load_store_v8bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_load_store_v8bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dwordx4 v[2:3], v[4:7], off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %val = load <8 x bfloat>, ptr addrspace(1) %in
+ store <8 x bfloat> %val, ptr addrspace(1) %out
+ ret void
+}
+
+; Copies a <16 x bfloat> from the global pointer %in to the global pointer
+; %out. The expected code splits the copy into two four-dword (dwordx4)
+; load/store pairs, the second one 16 bytes past the first. That displacement
+; appears either as an instruction offset or, where the flat instructions are
+; used, as an explicit 64-bit address add.
+define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; GCN-LABEL: test_load_store_v16bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_load_store_v16bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_load_store_v16bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_load_store_v16bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_load_store_v16bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: global_store_dwordx4 v[2:3], v[4:7], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %val = load <16 x bfloat>, ptr addrspace(1) %in
+ store <16 x bfloat> %val, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) { ; Stores the scalar bfloat argument %in through the addrspace(1) pointer %out.
+; GCN-LABEL: test_arg_store:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_arg_store:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_arg_store:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_store_short v[1:2], v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_arg_store:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_store_short v[1:2], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_arg_store:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_store_short v[1:2], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ store bfloat %in, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) { ; Stores the <2 x bfloat> argument %in through the addrspace(1) pointer %out.
+; GCN-LABEL: test_arg_store_v2bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_arg_store_v2bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_arg_store_v2bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_store_dword v[1:2], v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_arg_store_v2bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v[1:2], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_arg_store_v2bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_store_dword v[1:2], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ store <2 x bfloat> %in, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_arg_store_v3bf16(<3 x bfloat> %in, ptr addrspace(1) %out) { ; Stores the <3 x bfloat> argument %in through the addrspace(1) pointer %out.
+; GCN-LABEL: test_arg_store_v3bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: buffer_store_short v0, v[3:4], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:2
+; GCN-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_arg_store_v3bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_store_short v0, v[3:4], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:2
+; GFX7-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_arg_store_v3bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 2, v2
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_store_short v[2:3], v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_store_short v[4:5], v6
+; GFX8-NEXT: flat_store_short v[2:3], v1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_arg_store_v3bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_store_short v[2:3], v0, off
+; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off offset:2
+; GFX9-NEXT: global_store_short v[2:3], v1, off offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_arg_store_v3bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_store_short v[2:3], v0, off
+; GFX10-NEXT: global_store_short_d16_hi v[2:3], v0, off offset:2
+; GFX10-NEXT: global_store_short v[2:3], v1, off offset:4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ store <3 x bfloat> %in, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) { ; Stores the <4 x bfloat> argument %in through the addrspace(1) pointer %out.
+; GCN-LABEL: test_arg_store_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: v_or_b32_e32 v1, v3, v2
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_arg_store_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_arg_store_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX8-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_arg_store_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_arg_store_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ store <4 x bfloat> %in, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) { ; Stores the <8 x bfloat> argument %in through the addrspace(1) pointer %out.
+; GCN-LABEL: test_arg_store_v8bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: v_or_b32_e32 v1, v3, v2
+; GCN-NEXT: v_or_b32_e32 v2, v5, v4
+; GCN-NEXT: v_or_b32_e32 v3, v7, v6
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_arg_store_v8bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v4
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_arg_store_v8bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX8-NEXT: v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_arg_store_v8bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX9-NEXT: v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_arg_store_v8bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX10-NEXT: v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ store <8 x bfloat> %in, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) { ; Stores the <16 x bfloat> argument %in through the addrspace(1) pointer %out.
+; GCN-LABEL: test_arg_store_v16bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: v_or_b32_e32 v1, v3, v2
+; GCN-NEXT: v_or_b32_e32 v2, v5, v4
+; GCN-NEXT: v_or_b32_e32 v3, v7, v6
+; GCN-NEXT: v_or_b32_e32 v4, v9, v8
+; GCN-NEXT: v_or_b32_e32 v5, v11, v10
+; GCN-NEXT: v_or_b32_e32 v6, v13, v12
+; GCN-NEXT: v_or_b32_e32 v7, v15, v14
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_arg_store_v16bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v4
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v9
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v8
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v11
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v10
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v13
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v12
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v15
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v14
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX7-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_arg_store_v16bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; GFX8-NEXT: v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v7
+; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GFX8-NEXT: v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v8
+; GFX8-NEXT: v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_arg_store_v16bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v7
+; GFX9-NEXT: v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
+; GFX9-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_arg_store_v16bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v7
+; GFX10-NEXT: v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
+; GFX10-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ store <16 x bfloat> %in, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) %out) { ; amdgpu_gfx calling convention: stores the inreg bfloat argument %in through the addrspace(1) pointer %out.
+; GCN-LABEL: test_inreg_arg_store:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: s_mov_b32 s38, 0
+; GCN-NEXT: s_mov_b32 s39, 0xf000
+; GCN-NEXT: s_mov_b64 s[36:37], 0
+; GCN-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_inreg_arg_store:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: s_mov_b32 s38, 0
+; GFX7-NEXT: s_mov_b32 s39, 0xf000
+; GFX7-NEXT: s_mov_b64 s[36:37], 0
+; GFX7-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_inreg_arg_store:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: flat_store_short v[0:1], v2
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_inreg_arg_store:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: global_store_short v[0:1], v2, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_inreg_arg_store:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-NEXT: global_store_short v[0:1], v2, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ store bfloat %in, ptr addrspace(1) %out
+ ret void
+}
+
+define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) { ; Writes %val into the byval(bfloat) addrspace(5) argument %bv and returns the value read back from it.
+; GCN-LABEL: test_byval:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_short v0, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_byval:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_store_short v0, off, s[0:3], s32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_byval:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_store_short v0, off, s[0:3], s32
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_byval:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], s32
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_byval:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_store_short v0, off, s[0:3], s32
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ store bfloat %val, ptr addrspace(5) %bv
+ %retval = load bfloat, ptr addrspace(5) %bv ; reloads the value that was just stored
+ ret bfloat %retval
+}
+
+define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) { ; Stores %val through the sret(bfloat) addrspace(5) pointer argument %sret.
+; GCN-LABEL: test_sret:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_sret:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_sret:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_sret:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_sret:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ store bfloat %val, ptr addrspace(5) %sret
+ ret void
+}
+
+define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; Loads a bfloat from %in, bitcasts it to i16, and stores the i16 to %out.
+; GCN-LABEL: test_bitcast_from_bfloat:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_bitcast_from_bfloat:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_bitcast_from_bfloat:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: flat_store_short v[2:3], v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_bitcast_from_bfloat:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_short v[2:3], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_bitcast_from_bfloat:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_short v[2:3], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %val = load bfloat, ptr addrspace(1) %in
+ %val_int = bitcast bfloat %val to i16
+ store i16 %val_int, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; Loads an i16 from %in, bitcasts it to bfloat, and stores the bfloat to %out.
+; GCN-LABEL: test_bitcast_to_bfloat:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_bitcast_to_bfloat:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_bitcast_to_bfloat:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_ushort v2, v[2:3]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: flat_store_short v[0:1], v2
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_bitcast_to_bfloat:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v2, v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_short v[0:1], v2, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_bitcast_to_bfloat:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_ushort v2, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_short v[0:1], v2, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %val = load i16, ptr addrspace(1) %in
+ %val_fp = bitcast i16 %val to bfloat
+ store bfloat %val_fp, ptr addrspace(1) %out
+ ret void
+}
+
+define bfloat @test_ret(bfloat %in) { ; Returns the scalar bfloat argument %in unchanged.
+; GCN-LABEL: test_ret:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_ret:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_ret:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_ret:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_ret:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ ret bfloat %in
+}
+
+define <2 x bfloat> @test_ret_v2bf16(<2 x bfloat> %in) { ; Returns the <2 x bfloat> argument %in unchanged.
+; GCN-LABEL: test_ret_v2bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_ret_v2bf16:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_ret_v2bf16:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_ret_v2bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_ret_v2bf16:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ ret <2 x bfloat> %in
+}
+
+define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) { ; Returns the <3 x bfloat> argument %in unchanged.
+; GCN-LABEL: test_ret_v3bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_ret_v3bf16:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_ret_v3bf16:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_ret_v3bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_ret_v3bf16:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ ret <3 x bfloat> %in
+}
+
+define <4 x bfloat> @test_ret_v4bf16(<4 x bfloat> %in) { ; Returns the <4 x bfloat> argument %in unchanged.
+; GCN-LABEL: test_ret_v4bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_ret_v4bf16:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_ret_v4bf16:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_ret_v4bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_ret_v4bf16:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ ret <4 x bfloat> %in
+}
+
+define <8 x bfloat> @test_ret_v8bf16(<8 x bfloat> %in) { ; Returns the <8 x bfloat> argument unchanged; the check lines below record the expected machine code for each check prefix.
+; GCN-LABEL: test_ret_v8bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_ret_v8bf16:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_ret_v8bf16:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_ret_v8bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_ret_v8bf16:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ ret <8 x bfloat> %in ; NOTE(review): the GFX8, GFX9 and GFX10 expectations only read v0 and v1 and write v1, v2 and v3 - confirm this is the intended result layout for eight elements.
+}
+
+define <16 x bfloat> @test_ret_v16bf16(<16 x bfloat> %in) { ; Returns the <16 x bfloat> argument unchanged; the check lines below record the expected machine code for each check prefix.
+; GCN-LABEL: test_ret_v16bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_ret_v16bf16:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_ret_v16bf16:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v4, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_mov_b32_e32 v6, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX8-NEXT: v_mov_b32_e32 v2, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v8
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_ret_v16bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v1
+; GFX9-NEXT: v_mov_b32_e32 v8, v2
+; GFX9-NEXT: v_mov_b32_e32 v6, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX9-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-NEXT: v_mov_b32_e32 v4, v8
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_ret_v16bf16:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX10-NEXT: v_mov_b32_e32 v1, v8
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ ret <16 x bfloat> %in ; NOTE(review): the GFX8, GFX9 and GFX10 expectations only read v0 through v3 - confirm this is the intended result layout for sixteen elements.
+}
+
+define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; Calls @test_arg_store with %in and volatile-stores the returned bfloat to %out; the check lines below record the expected machine code for each check prefix.
+; GCN-LABEL: test_call:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s33
+; GCN-NEXT: s_mov_b32 s33, s32
+; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_writelane_b32 v2, s30, 0
+; GCN-NEXT: v_writelane_b32 v2, s31, 1
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_readlane_b32 s31, v2, 1
+; GCN-NEXT: v_readlane_b32 s30, v2, 0
+; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_addk_i32 s32, 0xfc00
+; GCN-NEXT: s_mov_b32 s33, s8
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_call:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s8, s33
+; GFX7-NEXT: s_mov_b32 s33, s32
+; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: s_addk_i32 s32, 0x400
+; GFX7-NEXT: s_getpc_b64 s[4:5]
+; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4
+; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX7-NEXT: v_writelane_b32 v2, s30, 0
+; GFX7-NEXT: v_writelane_b32 v2, s31, 1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX7-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readlane_b32 s31, v2, 1
+; GFX7-NEXT: v_readlane_b32 s30, v2, 0
+; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: s_addk_i32 s32, 0xfc00
+; GFX7-NEXT: s_mov_b32 s33, s8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_call:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s6, s33
+; GFX8-NEXT: s_mov_b32 s33, s32
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_addk_i32 s32, 0x400
+; GFX8-NEXT: s_getpc_b64 s[4:5]
+; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4
+; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT: v_writelane_b32 v2, s30, 0
+; GFX8-NEXT: v_writelane_b32 v2, s31, 1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX8-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readlane_b32 s31, v2, 1
+; GFX8-NEXT: v_readlane_b32 s30, v2, 0
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_addk_i32 s32, 0xfc00
+; GFX8-NEXT: s_mov_b32 s33, s6
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_call:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s6, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: v_writelane_b32 v2, s30, 0
+; GFX9-NEXT: v_writelane_b32 v2, s31, 1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readlane_b32 s31, v2, 1
+; GFX9-NEXT: v_readlane_b32 s30, v2, 0
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-NEXT: s_mov_b32 s33, s6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_call:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s6, s33
+; GFX10-NEXT: s_mov_b32 s33, s32
+; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: s_getpc_b64 s[4:5]
+; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4
+; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12
+; GFX10-NEXT: v_writelane_b32 v2, s30, 0
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT: v_writelane_b32 v2, s31, 1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_readlane_b32 s31, v2, 1
+; GFX10-NEXT: v_readlane_b32 s30, v2, 0
+; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_addk_i32 s32, 0xfe00
+; GFX10-NEXT: s_mov_b32 s33, s6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %result = call bfloat @test_arg_store(bfloat %in)
+ store volatile bfloat %result, ptr addrspace(5) %out ; volatile so the store of the call result is kept and shows up in the checked output
+ ret void
+}
+
+define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; Calls @test_arg_store_v2bf16 with %in and volatile-stores the returned <2 x bfloat> to %out; the check lines below record the expected machine code for each check prefix.
+; GCN-LABEL: test_call_v2bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s33
+; GCN-NEXT: s_mov_b32 s33, s32
+; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_writelane_b32 v3, s30, 0
+; GCN-NEXT: v_writelane_b32 v3, s31, 1
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_readlane_b32 s31, v3, 1
+; GCN-NEXT: v_readlane_b32 s30, v3, 0
+; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_addk_i32 s32, 0xfc00
+; GCN-NEXT: s_mov_b32 s33, s8
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_call_v2bf16:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s8, s33
+; GFX7-NEXT: s_mov_b32 s33, s32
+; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: s_addk_i32 s32, 0x400
+; GFX7-NEXT: s_getpc_b64 s[4:5]
+; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX7-NEXT: v_writelane_b32 v3, s30, 0
+; GFX7-NEXT: v_writelane_b32 v3, s31, 1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readlane_b32 s31, v3, 1
+; GFX7-NEXT: v_readlane_b32 s30, v3, 0
+; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: s_addk_i32 s32, 0xfc00
+; GFX7-NEXT: s_mov_b32 s33, s8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_call_v2bf16:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s6, s33
+; GFX8-NEXT: s_mov_b32 s33, s32
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_addk_i32 s32, 0x400
+; GFX8-NEXT: s_getpc_b64 s[4:5]
+; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT: v_writelane_b32 v2, s30, 0
+; GFX8-NEXT: v_writelane_b32 v2, s31, 1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readlane_b32 s31, v2, 1
+; GFX8-NEXT: v_readlane_b32 s30, v2, 0
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_addk_i32 s32, 0xfc00
+; GFX8-NEXT: s_mov_b32 s33, s6
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_call_v2bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s6, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: v_writelane_b32 v2, s30, 0
+; GFX9-NEXT: v_writelane_b32 v2, s31, 1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readlane_b32 s31, v2, 1
+; GFX9-NEXT: v_readlane_b32 s30, v2, 0
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-NEXT: s_mov_b32 s33, s6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_call_v2bf16:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s6, s33
+; GFX10-NEXT: s_mov_b32 s33, s32
+; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: s_getpc_b64 s[4:5]
+; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX10-NEXT: v_writelane_b32 v2, s30, 0
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT: v_writelane_b32 v2, s31, 1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_readlane_b32 s31, v2, 1
+; GFX10-NEXT: v_readlane_b32 s30, v2, 0
+; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_addk_i32 s32, 0xfe00
+; GFX10-NEXT: s_mov_b32 s33, s6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %result = call <2 x bfloat> @test_arg_store_v2bf16(<2 x bfloat> %in)
+ store volatile <2 x bfloat> %result, ptr addrspace(5) %out ; volatile so the store of the call result is kept and shows up in the checked output
+ ret void
+}
+
+define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; Calls @test_arg_store_v2bf16 with a <3 x bfloat> operand and volatile-stores the returned <3 x bfloat> to %out; the check lines below record the expected machine code for each check prefix.
+; GCN-LABEL: test_call_v3bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s33
+; GCN-NEXT: s_mov_b32 s33, s32
+; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_writelane_b32 v4, s30, 0
+; GCN-NEXT: v_writelane_b32 v4, s31, 1
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 4, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_short v2, v5, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_readlane_b32 s31, v4, 1
+; GCN-NEXT: v_readlane_b32 s30, v4, 0
+; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_addk_i32 s32, 0xfc00
+; GCN-NEXT: s_mov_b32 s33, s8
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_call_v3bf16:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s8, s33
+; GFX7-NEXT: s_mov_b32 s33, s32
+; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: s_addk_i32 s32, 0x400
+; GFX7-NEXT: s_getpc_b64 s[4:5]
+; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX7-NEXT: v_writelane_b32 v4, s30, 0
+; GFX7-NEXT: v_writelane_b32 v4, s31, 1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v3
+; GFX7-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readlane_b32 s31, v4, 1
+; GFX7-NEXT: v_readlane_b32 s30, v4, 0
+; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: s_addk_i32 s32, 0xfc00
+; GFX7-NEXT: s_mov_b32 s33, s8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_call_v3bf16:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s6, s33
+; GFX8-NEXT: s_mov_b32 s33, s32
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_addk_i32 s32, 0x400
+; GFX8-NEXT: s_getpc_b64 s[4:5]
+; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT: v_writelane_b32 v3, s30, 0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT: v_writelane_b32 v3, s31, 1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2
+; GFX8-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readlane_b32 s31, v3, 1
+; GFX8-NEXT: v_readlane_b32 s30, v3, 0
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_addk_i32 s32, 0xfc00
+; GFX8-NEXT: s_mov_b32 s33, s6
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_call_v3bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s6, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: v_writelane_b32 v3, s30, 0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_writelane_b32 v3, s31, 1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v0
+; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readlane_b32 s31, v3, 1
+; GFX9-NEXT: v_readlane_b32 s30, v3, 0
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-NEXT: s_mov_b32 s33, s6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_call_v3bf16:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s6, s33
+; GFX10-NEXT: s_mov_b32 s33, s32
+; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: s_getpc_b64 s[4:5]
+; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX10-NEXT: v_writelane_b32 v3, s30, 0
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_writelane_b32 v3, s31, 1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v0, v0
+; GFX10-NEXT: v_readlane_b32 s31, v3, 1
+; GFX10-NEXT: v_readlane_b32 s30, v3, 0
+; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_addk_i32 s32, 0xfe00
+; GFX10-NEXT: s_mov_b32 s33, s6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %result = call <3 x bfloat> @test_arg_store_v2bf16(<3 x bfloat> %in) ; NOTE(review): the callee is the _v2bf16 symbol although the operand is <3 x bfloat>; the check lines reference the same symbol, so it is kept - confirm this is intended.
+ store volatile <3 x bfloat> %result, ptr addrspace(5) %out ; volatile so the store of the call result is kept and shows up in the checked output
+ ret void
+}
+
+define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; Calls @test_arg_store_v2bf16 with a <4 x bfloat> operand and volatile-stores the returned <4 x bfloat> to %out; the check lines below record the expected machine code for each check prefix.
+; GCN-LABEL: test_call_v4bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s33
+; GCN-NEXT: s_mov_b32 s33, s32
+; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_writelane_b32 v5, s30, 0
+; GCN-NEXT: v_writelane_b32 v5, s31, 1
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 4, v4
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: v_or_b32_e32 v1, v3, v2
+; GCN-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_readlane_b32 s31, v5, 1
+; GCN-NEXT: v_readlane_b32 s30, v5, 0
+; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_addk_i32 s32, 0xfc00
+; GCN-NEXT: s_mov_b32 s33, s8
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_call_v4bf16:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s8, s33
+; GFX7-NEXT: s_mov_b32 s33, s32
+; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: s_addk_i32 s32, 0x400
+; GFX7-NEXT: s_getpc_b64 s[4:5]
+; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX7-NEXT: v_writelane_b32 v5, s30, 0
+; GFX7-NEXT: v_writelane_b32 v5, s31, 1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v4
+; GFX7-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readlane_b32 s31, v5, 1
+; GFX7-NEXT: v_readlane_b32 s30, v5, 0
+; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: s_addk_i32 s32, 0xfc00
+; GFX7-NEXT: s_mov_b32 s33, s8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_call_v4bf16:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s6, s33
+; GFX8-NEXT: s_mov_b32 s33, s32
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_addk_i32 s32, 0x400
+; GFX8-NEXT: s_getpc_b64 s[4:5]
+; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT: v_writelane_b32 v3, s30, 0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT: v_writelane_b32 v3, s31, 1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX8-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2
+; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readlane_b32 s31, v3, 1
+; GFX8-NEXT: v_readlane_b32 s30, v3, 0
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_addk_i32 s32, 0xfc00
+; GFX8-NEXT: s_mov_b32 s33, s6
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_call_v4bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s6, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: v_writelane_b32 v3, s30, 0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_writelane_b32 v3, s31, 1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readlane_b32 s31, v3, 1
+; GFX9-NEXT: v_readlane_b32 s30, v3, 0
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-NEXT: s_mov_b32 s33, s6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_call_v4bf16:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s6, s33
+; GFX10-NEXT: s_mov_b32 s33, s32
+; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: s_getpc_b64 s[4:5]
+; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX10-NEXT: v_writelane_b32 v3, s30, 0
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_writelane_b32 v3, s31, 1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_readlane_b32 s31, v3, 1
+; GFX10-NEXT: v_readlane_b32 s30, v3, 0
+; GFX10-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_addk_i32 s32, 0xfe00
+; GFX10-NEXT: s_mov_b32 s33, s6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %result = call <4 x bfloat> @test_arg_store_v2bf16(<4 x bfloat> %in) ; NOTE(review): the callee is the _v2bf16 symbol although the operand is <4 x bfloat>; the check lines reference the same symbol, so it is kept - confirm this is intended.
+ store volatile <4 x bfloat> %result, ptr addrspace(5) %out ; volatile so the store of the call result is kept and shows up in the checked output
+ ret void
+}
+
+define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
+; GCN-LABEL: test_call_v8bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s33
+; GCN-NEXT: s_mov_b32 s33, s32
+; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_writelane_b32 v9, s30, 0
+; GCN-NEXT: v_writelane_b32 v9, s31, 1
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 4, v8
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 8, v8
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 12, v8
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: v_or_b32_e32 v1, v3, v2
+; GCN-NEXT: v_or_b32_e32 v2, v5, v4
+; GCN-NEXT: v_or_b32_e32 v3, v7, v6
+; GCN-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v1, v10, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v2, v11, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v3, v12, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_readlane_b32 s31, v9, 1
+; GCN-NEXT: v_readlane_b32 s30, v9, 0
+; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_addk_i32 s32, 0xfc00
+; GCN-NEXT: s_mov_b32 s33, s8
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_call_v8bf16:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s8, s33
+; GFX7-NEXT: s_mov_b32 s33, s32
+; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: s_addk_i32 s32, 0x400
+; GFX7-NEXT: s_getpc_b64 s[4:5]
+; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX7-NEXT: v_writelane_b32 v9, s30, 0
+; GFX7-NEXT: v_writelane_b32 v9, s31, 1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v4
+; GFX7-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v8
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v8
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 12, v8
+; GFX7-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readlane_b32 s31, v9, 1
+; GFX7-NEXT: v_readlane_b32 s30, v9, 0
+; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: s_addk_i32 s32, 0xfc00
+; GFX7-NEXT: s_mov_b32 s33, s8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_call_v8bf16:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s6, s33
+; GFX8-NEXT: s_mov_b32 s33, s32
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_addk_i32 s32, 0x400
+; GFX8-NEXT: s_getpc_b64 s[4:5]
+; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v2, v1
+; GFX8-NEXT: v_writelane_b32 v5, s30, 0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_writelane_b32 v5, s31, 1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX8-NEXT: v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX8-NEXT: v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX8-NEXT: v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v4
+; GFX8-NEXT: v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 12, v4
+; GFX8-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readlane_b32 s31, v5, 1
+; GFX8-NEXT: v_readlane_b32 s30, v5, 0
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_addk_i32 s32, 0xfc00
+; GFX8-NEXT: s_mov_b32 s33, s6
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_call_v8bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s6, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, v1
+; GFX9-NEXT: v_writelane_b32 v5, s30, 0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: v_writelane_b32 v5, s31, 1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX9-NEXT: v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readlane_b32 s31, v5, 1
+; GFX9-NEXT: v_readlane_b32 s30, v5, 0
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-NEXT: s_mov_b32 s33, s6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_call_v8bf16:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s6, s33
+; GFX10-NEXT: s_mov_b32 s33, s32
+; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: s_getpc_b64 s[4:5]
+; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT: v_writelane_b32 v5, s30, 0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT: v_writelane_b32 v5, s31, 1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX10-NEXT: v_readlane_b32 s31, v5, 1
+; GFX10-NEXT: v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_readlane_b32 s30, v5, 0
+; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_addk_i32 s32, 0xfe00
+; GFX10-NEXT: s_mov_b32 s33, s6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %result = call <8 x bfloat> @test_arg_store_v2bf16(<8 x bfloat> %in) ; NOTE(review): callee is named for v2bf16 but is called here with an <8 x bfloat> argument and return type -- confirm this is intentional
+ store volatile <8 x bfloat> %result, ptr addrspace(5) %out ; volatile store of the call result through %out (four dword stores in the checks above)
+ ret void
+}
+
+define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
+; GCN-LABEL: test_call_v16bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s33
+; GCN-NEXT: s_mov_b32 s33, s32
+; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_writelane_b32 v17, s30, 0
+; GCN-NEXT: v_writelane_b32 v17, s31, 1
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GCN-NEXT: v_add_i32_e32 v18, vcc, 4, v16
+; GCN-NEXT: v_add_i32_e32 v19, vcc, 8, v16
+; GCN-NEXT: v_add_i32_e32 v20, vcc, 12, v16
+; GCN-NEXT: v_add_i32_e32 v21, vcc, 16, v16
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 20, v16
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 24, v16
+; GCN-NEXT: v_add_i32_e32 v24, vcc, 28, v16
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: v_or_b32_e32 v1, v3, v2
+; GCN-NEXT: v_or_b32_e32 v2, v5, v4
+; GCN-NEXT: v_or_b32_e32 v3, v7, v6
+; GCN-NEXT: v_or_b32_e32 v4, v9, v8
+; GCN-NEXT: v_or_b32_e32 v5, v11, v10
+; GCN-NEXT: v_or_b32_e32 v6, v13, v12
+; GCN-NEXT: v_or_b32_e32 v7, v15, v14
+; GCN-NEXT: buffer_store_dword v0, v16, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v1, v18, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v3, v20, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v4, v21, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v5, v22, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v6, v23, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v7, v24, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_readlane_b32 s31, v17, 1
+; GCN-NEXT: v_readlane_b32 s30, v17, 0
+; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_addk_i32 s32, 0xfc00
+; GCN-NEXT: s_mov_b32 s33, s8
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_call_v16bf16:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s8, s33
+; GFX7-NEXT: s_mov_b32 s33, s32
+; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT: buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: s_addk_i32 s32, 0x400
+; GFX7-NEXT: s_getpc_b64 s[4:5]
+; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX7-NEXT: v_writelane_b32 v17, s30, 0
+; GFX7-NEXT: v_writelane_b32 v17, s31, 1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v4
+; GFX7-NEXT: buffer_store_dword v0, v16, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v16
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v16
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v9
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v8
+; GFX7-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 12, v16
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v11
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v10
+; GFX7-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 16, v16
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v13
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v12
+; GFX7-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 20, v16
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v15
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v14
+; GFX7-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 24, v16
+; GFX7-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX7-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 28, v16
+; GFX7-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readlane_b32 s31, v17, 1
+; GFX7-NEXT: v_readlane_b32 s30, v17, 0
+; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: s_addk_i32 s32, 0xfc00
+; GFX7-NEXT: s_mov_b32 s33, s8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_call_v16bf16:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s6, s33
+; GFX8-NEXT: s_mov_b32 s33, s32
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_addk_i32 s32, 0x400
+; GFX8-NEXT: s_getpc_b64 s[4:5]
+; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v4, v1
+; GFX8-NEXT: v_mov_b32_e32 v10, v2
+; GFX8-NEXT: v_mov_b32_e32 v6, v3
+; GFX8-NEXT: v_writelane_b32 v9, s30, 0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX8-NEXT: v_mov_b32_e32 v2, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v10
+; GFX8-NEXT: v_writelane_b32 v9, s31, 1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX8-NEXT: v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; GFX8-NEXT: v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; GFX8-NEXT: v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v4
+; GFX8-NEXT: v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 12, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v5
+; GFX8-NEXT: v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v6
+; GFX8-NEXT: v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 20, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v7
+; GFX8-NEXT: v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 24, v8
+; GFX8-NEXT: v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 28, v8
+; GFX8-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readlane_b32 s31, v9, 1
+; GFX8-NEXT: v_readlane_b32 s30, v9, 0
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_addk_i32 s32, 0xfc00
+; GFX8-NEXT: s_mov_b32 s33, s6
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_call_v16bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s6, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v4, v1
+; GFX9-NEXT: v_mov_b32_e32 v10, v2
+; GFX9-NEXT: v_mov_b32_e32 v6, v3
+; GFX9-NEXT: v_writelane_b32 v9, s30, 0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX9-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-NEXT: v_mov_b32_e32 v4, v10
+; GFX9-NEXT: v_writelane_b32 v9, s31, 1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v7
+; GFX9-NEXT: v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen offset:8
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen offset:12
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen offset:20
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readlane_b32 s31, v9, 1
+; GFX9-NEXT: v_readlane_b32 s30, v9, 0
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-NEXT: s_mov_b32 s33, s6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_call_v16bf16:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s6, s33
+; GFX10-NEXT: s_mov_b32 s33, s32
+; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: s_getpc_b64 s[4:5]
+; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
+; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
+; GFX10-NEXT: v_mov_b32_e32 v4, v1
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v10, v2
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_writelane_b32 v9, s30, 0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX10-NEXT: v_mov_b32_e32 v2, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v10
+; GFX10-NEXT: v_writelane_b32 v9, s31, 1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v7
+; GFX10-NEXT: v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen offset:4
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen offset:8
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen offset:12
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen offset:16
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen offset:20
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_readlane_b32 s31, v9, 1
+; GFX10-NEXT: v_readlane_b32 s30, v9, 0
+; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_addk_i32 s32, 0xfe00
+; GFX10-NEXT: s_mov_b32 s33, s6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %result = call <16 x bfloat> @test_arg_store_v2bf16(<16 x bfloat> %in) ; NOTE(review): callee is named for v2bf16 but is called here with a <16 x bfloat> argument and return type -- confirm this is intentional
+ store volatile <16 x bfloat> %result, ptr addrspace(5) %out ; volatile store of the call result through %out (eight dword stores in the checks above)
+ ret void
+}
+
+define bfloat @test_alloca_load_store_ret(bfloat %in) {
+; GCN-LABEL: test_alloca_load_store_ret:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_short v0, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_alloca_load_store_ret:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_store_short v0, off, s[0:3], s32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_alloca_load_store_ret:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_store_short v0, off, s[0:3], s32
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_alloca_load_store_ret:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], s32
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_alloca_load_store_ret:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_store_short v0, off, s[0:3], s32
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %in.addr = alloca bfloat, align 2, addrspace(5) ; 2-byte-aligned bfloat stack slot in address space 5
+ store volatile bfloat %in, ptr addrspace(5) %in.addr, align 2 ; volatile: the 16-bit store is emitted (buffer_store_short above)
+ %loaded = load volatile bfloat, ptr addrspace(5) %in.addr, align 2 ; volatile: value is reloaded from the slot (buffer_load_ushort above)
+ ret bfloat %loaded ; return the reloaded value
+}
+
+; Returns a { <32 x i32>, bfloat } aggregate assembled from the two arguments
+; with insertvalue. Per the checks below, the result is written through the
+; address held in v0: the 32 dwords of %b land at byte offsets 0-124 (the last
+; three are first reloaded from the stack at s32+0/+4/+8), followed by a
+; 16-bit store of %a (v1) at byte offset 128 (0x80).
+define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
+; GCN-LABEL: test_overflow_stack:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 4, v0
+; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0
+; GCN-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0
+; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 16, v0
+; GCN-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 20, v0
+; GCN-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 24, v0
+; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 28, v0
+; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 32, v0
+; GCN-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 36, v0
+; GCN-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 40, v0
+; GCN-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 44, v0
+; GCN-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 48, v0
+; GCN-NEXT: buffer_store_dword v14, v3, s[0:3], 0 offen
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 52, v0
+; GCN-NEXT: buffer_store_dword v15, v4, s[0:3], 0 offen
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 56, v0
+; GCN-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 60, v0
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 64, v0
+; GCN-NEXT: buffer_store_dword v17, v5, s[0:3], 0 offen
+; GCN-NEXT: v_mov_b32_e32 v5, 0x44
+; GCN-NEXT: v_mov_b32_e32 v7, 0x48
+; GCN-NEXT: buffer_store_dword v18, v6, s[0:3], 0 offen
+; GCN-NEXT: v_mov_b32_e32 v6, 0x4c
+; GCN-NEXT: v_mov_b32_e32 v8, 0x50
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v0, v5
+; GCN-NEXT: buffer_store_dword v19, v5, s[0:3], 0 offen
+; GCN-NEXT: v_mov_b32_e32 v5, 0x54
+; GCN-NEXT: v_mov_b32_e32 v9, 0x58
+; GCN-NEXT: v_add_i32_e32 v7, vcc, v0, v7
+; GCN-NEXT: buffer_store_dword v20, v7, s[0:3], 0 offen
+; GCN-NEXT: v_mov_b32_e32 v7, 0x5c
+; GCN-NEXT: v_mov_b32_e32 v10, 0x60
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v0, v6
+; GCN-NEXT: buffer_store_dword v21, v6, s[0:3], 0 offen
+; GCN-NEXT: v_mov_b32_e32 v6, 0x64
+; GCN-NEXT: v_mov_b32_e32 v11, 0x68
+; GCN-NEXT: v_add_i32_e32 v8, vcc, v0, v8
+; GCN-NEXT: buffer_store_dword v22, v8, s[0:3], 0 offen
+; GCN-NEXT: v_mov_b32_e32 v8, 0x6c
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x70, v0
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v0, v5
+; GCN-NEXT: buffer_store_dword v23, v5, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x74, v0
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x78, v0
+; GCN-NEXT: v_add_i32_e32 v9, vcc, v0, v9
+; GCN-NEXT: buffer_store_dword v24, v9, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x7c, v0
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x80, v0
+; GCN-NEXT: v_add_i32_e32 v7, vcc, v0, v7
+; GCN-NEXT: v_add_i32_e32 v10, vcc, v0, v10
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v0, v6
+; GCN-NEXT: v_add_i32_e32 v11, vcc, v0, v11
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; GCN-NEXT: buffer_store_dword v25, v7, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v26, v10, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v27, v6, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v28, v11, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v30, v12, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(14)
+; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v3, v13, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v1, v14, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_overflow_stack:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0
+; GFX7-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 8, v0
+; GFX7-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 12, v0
+; GFX7-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0
+; GFX7-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 20, v0
+; GFX7-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 24, v0
+; GFX7-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 28, v0
+; GFX7-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 32, v0
+; GFX7-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 36, v0
+; GFX7-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 40, v0
+; GFX7-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 44, v0
+; GFX7-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen
+; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 48, v0
+; GFX7-NEXT: buffer_store_dword v14, v3, s[0:3], 0 offen
+; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 52, v0
+; GFX7-NEXT: buffer_store_dword v15, v4, s[0:3], 0 offen
+; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 56, v0
+; GFX7-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 60, v0
+; GFX7-NEXT: buffer_store_dword v17, v5, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 64, v0
+; GFX7-NEXT: buffer_store_dword v18, v5, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v5, 0x44
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, v0, v5
+; GFX7-NEXT: buffer_store_dword v19, v5, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v5, 0x48
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, v0, v5
+; GFX7-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v5, 0x4c
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, v0, v5
+; GFX7-NEXT: buffer_store_dword v21, v5, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v5, 0x50
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, v0, v5
+; GFX7-NEXT: buffer_store_dword v22, v5, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v5, 0x54
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, v0, v5
+; GFX7-NEXT: buffer_store_dword v23, v5, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v5, 0x58
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, v0, v5
+; GFX7-NEXT: buffer_store_dword v24, v5, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v5, 0x5c
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, v0, v5
+; GFX7-NEXT: buffer_store_dword v25, v5, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v5, 0x60
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, v0, v5
+; GFX7-NEXT: buffer_store_dword v26, v5, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v5, 0x64
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, v0, v5
+; GFX7-NEXT: buffer_store_dword v27, v5, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v5, 0x68
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, v0, v5
+; GFX7-NEXT: buffer_store_dword v28, v5, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v5, 0x6c
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, v0, v5
+; GFX7-NEXT: buffer_store_dword v29, v5, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0
+; GFX7-NEXT: buffer_store_dword v30, v5, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x74, v0
+; GFX7-NEXT: s_waitcnt vmcnt(14)
+; GFX7-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
+; GFX7-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x7c, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x80, v0
+; GFX7-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_overflow_stack:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
+; GFX8-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0
+; GFX8-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 12, v0
+; GFX8-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0
+; GFX8-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 20, v0
+; GFX8-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 24, v0
+; GFX8-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 28, v0
+; GFX8-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0
+; GFX8-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 36, v0
+; GFX8-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 40, v0
+; GFX8-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 44, v0
+; GFX8-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen
+; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 48, v0
+; GFX8-NEXT: buffer_store_dword v14, v3, s[0:3], 0 offen
+; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 52, v0
+; GFX8-NEXT: buffer_store_dword v15, v4, s[0:3], 0 offen
+; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 56, v0
+; GFX8-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 60, v0
+; GFX8-NEXT: buffer_store_dword v17, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 64, v0
+; GFX8-NEXT: buffer_store_dword v18, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x44
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v0, v5
+; GFX8-NEXT: buffer_store_dword v19, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x48
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v0, v5
+; GFX8-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x4c
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v0, v5
+; GFX8-NEXT: buffer_store_dword v21, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x50
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v0, v5
+; GFX8-NEXT: buffer_store_dword v22, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x54
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v0, v5
+; GFX8-NEXT: buffer_store_dword v23, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x58
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v0, v5
+; GFX8-NEXT: buffer_store_dword v24, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x5c
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v0, v5
+; GFX8-NEXT: buffer_store_dword v25, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x60
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v0, v5
+; GFX8-NEXT: buffer_store_dword v26, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x64
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v0, v5
+; GFX8-NEXT: buffer_store_dword v27, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x68
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v0, v5
+; GFX8-NEXT: buffer_store_dword v28, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x6c
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v0, v5
+; GFX8-NEXT: buffer_store_dword v29, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x70, v0
+; GFX8-NEXT: buffer_store_dword v30, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x74, v0
+; GFX8-NEXT: s_waitcnt vmcnt(14)
+; GFX8-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0
+; GFX8-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7c, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0
+; GFX8-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GFX8-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_overflow_stack:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
+; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
+; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32
+; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36
+; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44
+; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48
+; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
+; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
+; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
+; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
+; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
+; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
+; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
+; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
+; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
+; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
+; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
+; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
+; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
+; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108
+; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112
+; GFX9-NEXT: s_waitcnt vmcnt(20)
+; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:116
+; GFX9-NEXT: s_waitcnt vmcnt(20)
+; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:120
+; GFX9-NEXT: s_waitcnt vmcnt(20)
+; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:124
+; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_overflow_stack:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x2
+; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
+; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
+; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32
+; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36
+; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40
+; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44
+; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48
+; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
+; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
+; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
+; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
+; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
+; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
+; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
+; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
+; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
+; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
+; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
+; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
+; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
+; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
+; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108
+; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112
+; GFX10-NEXT: s_waitcnt vmcnt(2)
+; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:116
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:120
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:124
+; GFX10-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0
+ %ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1
+ ret { <32 x i32>, bfloat } %ins.1
+}
+
+; FIXME: unable to translate instruction: fpext
+; define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) {
+; %load = load <2 x bfloat>, ptr addrspace(1) %ptr
+; %fpext = fpext <2 x bfloat> %load to <2 x float>
+; ret <2 x float> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) {
+; %load = load <3 x bfloat>, ptr addrspace(1) %ptr
+; %fpext = fpext <3 x bfloat> %load to <3 x float>
+; ret <3 x float> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <4 x float> @global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) {
+; %load = load <4 x bfloat>, ptr addrspace(1) %ptr
+; %fpext = fpext <4 x bfloat> %load to <4 x float>
+; ret <4 x float> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) {
+; %load = load <5 x bfloat>, ptr addrspace(1) %ptr
+; %fpext = fpext <5 x bfloat> %load to <5 x float>
+; ret <5 x float> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) {
+; %load = load <6 x bfloat>, ptr addrspace(1) %ptr
+; %fpext = fpext <6 x bfloat> %load to <6 x float>
+; ret <6 x float> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) {
+; %load = load <8 x bfloat>, ptr addrspace(1) %ptr
+; %fpext = fpext <8 x bfloat> %load to <8 x float>
+; ret <8 x float> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) {
+; %load = load <16 x bfloat>, ptr addrspace(1) %ptr
+; %fpext = fpext <16 x bfloat> %load to <16 x float>
+; ret <16 x float> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) {
+; %load = load <32 x bfloat>, ptr addrspace(1) %ptr
+; %fpext = fpext <32 x bfloat> %load to <32 x float>
+; ret <32 x float> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) {
+; %load = load <2 x bfloat>, ptr addrspace(1) %ptr
+; %fpext = fpext <2 x bfloat> %load to <2 x double>
+; ret <2 x double> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) {
+; %load = load <3 x bfloat>, ptr addrspace(1) %ptr
+; %fpext = fpext <3 x bfloat> %load to <3 x double>
+; ret <3 x double> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) {
+; %load = load <4 x bfloat>, ptr addrspace(1) %ptr
+; %fpext = fpext <4 x bfloat> %load to <4 x double>
+; ret <4 x double> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) {
+; %load = load <5 x bfloat>, ptr addrspace(1) %ptr
+; %fpext = fpext <5 x bfloat> %load to <5 x double>
+; ret <5 x double> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) {
+; %load = load <6 x bfloat>, ptr addrspace(1) %ptr
+; %fpext = fpext <6 x bfloat> %load to <6 x double>
+; ret <6 x double> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) {
+; %load = load <8 x bfloat>, ptr addrspace(1) %ptr
+; %fpext = fpext <8 x bfloat> %load to <8 x double>
+; ret <8 x double> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) {
+; %load = load <16 x bfloat>, ptr addrspace(1) %ptr
+; %fpext = fpext <16 x bfloat> %load to <16 x double>
+; ret <16 x double> %fpext
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
+; %load = load <32 x bfloat>, ptr addrspace(1) %ptr
+; %fpext = fpext <32 x bfloat> %load to <32 x double>
+; ret <32 x double> %fpext
+; }
+
+; Scalar bfloat fadd. Per the checks below, GCN and GFX7 extend both operands
+; with v_cvt_f32_f16, add in f32 and narrow the sum with v_cvt_f16_f32, while
+; GFX8, GFX9 and GFX10 emit a single v_add_f16.
+; NOTE(review): every expected sequence uses f16 instructions on bfloat
+; operands - confirm this is the intended GlobalISel lowering for bf16.
+define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fadd_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_add_f32_e32 v0, v0, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_f16_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_f16_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_f16_e32 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fadd bfloat %a, %b
+ ret bfloat %op
+}
+
+; <2 x bfloat> fadd. Per the checks below, GCN and GFX7 handle each element
+; separately (v_cvt_f32_f16, v_add_f32, v_cvt_f16_f32 on v0/v2 and v1/v3).
+; GFX8 adds the low halves with v_add_f16 and the high halves with an SDWA
+; v_add_f16, then ORs the two results together. GFX9 and GFX10 use one
+; v_pk_add_f16.
+; NOTE(review): every expected sequence uses f16 instructions on bfloat
+; operands - confirm this is the intended GlobalISel lowering for bf16.
+define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_fadd_v2bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_add_f32_e32 v0, v0, v2
+; GCN-NEXT: v_add_f32_e32 v1, v1, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_v2bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_v2bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_f16_e32 v2, v0, v1
+; GFX8-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_v2bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_add_f16 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_v2bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_add_f16 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fadd <2 x bfloat> %a, %b
+ ret <2 x bfloat> %op
+}
+
+; <3 x bfloat> fadd. Per the checks below, GCN and GFX7 convert, add and
+; narrow three element pairs (v0/v3, v1/v4, v2/v5) using v_cvt_f32_f16,
+; v_add_f32 and v_cvt_f16_f32.
+; NOTE(review): the GFX8, GFX9 and GFX10 sequences read only v0 and v2 as
+; inputs and compute two 16-bit sums; an add for the third element is not
+; visible - confirm these expectations are correct before relying on them.
+; NOTE(review): every expected sequence uses f16 instructions on bfloat
+; operands - confirm this is the intended GlobalISel lowering for bf16.
+define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+; GCN-LABEL: v_fadd_v3bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: v_add_f32_e32 v0, v0, v3
+; GCN-NEXT: v_add_f32_e32 v1, v1, v4
+; GCN-NEXT: v_add_f32_e32 v2, v2, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_v3bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_v3bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_f16_e32 v3, v0, v2
+; GFX8-NEXT: v_add_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_v3bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v0
+; GFX9-NEXT: v_bfi_b32 v1, s4, v2, v2
+; GFX9-NEXT: v_pk_add_f16 v0, v0, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_v3bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v0, v0
+; GFX10-NEXT: v_bfi_b32 v1, 0xffff, v2, v2
+; GFX10-NEXT: v_pk_add_f16 v0, v0, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fadd <3 x bfloat> %a, %b
+ ret <3 x bfloat> %op
+}
+
+; <4 x bfloat> fadd. Per the checks below, GCN and GFX7 convert, add and
+; narrow four element pairs (v0/v4, v1/v5, v2/v6, v3/v7) using v_cvt_f32_f16,
+; v_add_f32 and v_cvt_f16_f32.
+; NOTE(review): the GFX8, GFX9 and GFX10 sequences read only v0 and v2 as
+; inputs and compute two 16-bit sums; adds for the remaining two elements are
+; not visible - confirm these expectations are correct before relying on them.
+; NOTE(review): every expected sequence uses f16 instructions on bfloat
+; operands - confirm this is the intended GlobalISel lowering for bf16.
+define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; GCN-LABEL: v_fadd_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: v_add_f32_e32 v0, v0, v4
+; GCN-NEXT: v_add_f32_e32 v1, v1, v5
+; GCN-NEXT: v_add_f32_e32 v2, v2, v6
+; GCN-NEXT: v_add_f32_e32 v3, v3, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6
+; GFX7-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_f16_e32 v3, v0, v2
+; GFX8-NEXT: v_add_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_add_f16 v0, v0, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT: v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_pk_add_f16 v0, v0, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fadd <4 x bfloat> %a, %b
+ ret <4 x bfloat> %op
+}
+
+; Element-wise fadd of two <8 x bfloat> vectors; the sum is returned by value.
+; The prefixed comment lines inside the function are FileCheck expectations
+; for the GCN, GFX7, GFX8, GFX9 and GFX10 check prefixes.
+; NOTE(review): the expectations use f16 instructions (v_cvt_f32_f16,
+; v_cvt_f16_f32, v_add_f16, v_pk_add_f16) although the operands are bfloat,
+; and the GFX8/GFX9/GFX10 sequences read only v0, v1, v4 and v5. Confirm that
+; this output is what the test intends to pin.
+define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; GCN-LABEL: v_fadd_v8bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT: v_add_f32_e32 v0, v0, v8
+; GCN-NEXT: v_add_f32_e32 v1, v1, v9
+; GCN-NEXT: v_add_f32_e32 v2, v2, v10
+; GCN-NEXT: v_add_f32_e32 v3, v3, v11
+; GCN-NEXT: v_add_f32_e32 v4, v4, v12
+; GCN-NEXT: v_add_f32_e32 v5, v5, v13
+; GCN-NEXT: v_add_f32_e32 v6, v6, v14
+; GCN-NEXT: v_add_f32_e32 v7, v7, v15
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_v8bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v10
+; GFX7-NEXT: v_add_f32_e32 v1, v1, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v11
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v12
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v13
+; GFX7-NEXT: v_add_f32_e32 v4, v4, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v14
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v15
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v8
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v9
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_v8bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_f16_e32 v6, v0, v4
+; GFX8-NEXT: v_add_f16_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v2, v1, v5
+; GFX8-NEXT: v_add_f16_sdwa v3, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v0, v6
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_v8bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX9-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_add_f16 v0, v0, v4
+; GFX9-NEXT: v_pk_add_f16 v2, v1, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_v8bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX10-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_pk_add_f16 v0, v0, v4
+; GFX10-NEXT: v_pk_add_f16 v2, v1, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fadd <8 x bfloat> %a, %b
+ ret <8 x bfloat> %op
+}
+
+; Element-wise fadd of two <16 x bfloat> vectors; the sum is returned by value.
+; The prefixed comment lines inside the function are FileCheck expectations
+; for the GCN, GFX7, GFX8, GFX9 and GFX10 check prefixes.
+; The GCN and GFX7 sequences contain one buffer_load_dword addressed via s32
+; (presumably an argument passed in memory -- verify against the calling
+; convention).
+; NOTE(review): the expectations use f16 instructions (v_cvt_f32_f16,
+; v_cvt_f16_f32, v_add_f16, v_pk_add_f16) although the operands are bfloat,
+; and the GFX8/GFX9/GFX10 sequences read only v0-v3 and v8-v11. Confirm that
+; this output is what the test intends to pin.
+define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
+; GCN-LABEL: v_fadd_v16bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT: v_add_f32_e32 v0, v0, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v17
+; GCN-NEXT: v_add_f32_e32 v1, v1, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v18
+; GCN-NEXT: v_add_f32_e32 v2, v2, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v19
+; GCN-NEXT: v_add_f32_e32 v3, v3, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v20
+; GCN-NEXT: v_add_f32_e32 v4, v4, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v21
+; GCN-NEXT: v_add_f32_e32 v5, v5, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v22
+; GCN-NEXT: v_add_f32_e32 v6, v6, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v23
+; GCN-NEXT: v_add_f32_e32 v7, v7, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v24
+; GCN-NEXT: v_add_f32_e32 v8, v8, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v25
+; GCN-NEXT: v_add_f32_e32 v9, v9, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v26
+; GCN-NEXT: v_add_f32_e32 v10, v10, v16
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT: v_cvt_f32_f16_e32 v17, v27
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT: v_cvt_f32_f16_e32 v18, v28
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v29
+; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v20, v30
+; GCN-NEXT: v_add_f32_e32 v11, v11, v17
+; GCN-NEXT: v_add_f32_e32 v12, v12, v18
+; GCN-NEXT: v_add_f32_e32 v13, v13, v19
+; GCN-NEXT: v_add_f32_e32 v14, v14, v20
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT: v_add_f32_e32 v15, v15, v16
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_v16bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v20
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_add_f32_e32 v1, v1, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v21
+; GFX7-NEXT: v_add_f32_e32 v4, v4, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v19
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v22
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v16
+; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v23
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v24
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v25
+; GFX7-NEXT: v_add_f32_e32 v8, v8, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v26
+; GFX7-NEXT: v_add_f32_e32 v9, v9, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v27
+; GFX7-NEXT: v_add_f32_e32 v10, v10, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v28
+; GFX7-NEXT: v_add_f32_e32 v11, v11, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v29
+; GFX7-NEXT: v_add_f32_e32 v12, v12, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v30
+; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GFX7-NEXT: v_add_f32_e32 v13, v13, v18
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_add_f32_e32 v14, v14, v17
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT: v_add_f32_e32 v15, v15, v16
+; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_v16bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_f16_e32 v12, v0, v8
+; GFX8-NEXT: v_add_f16_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v13, v1, v9
+; GFX8-NEXT: v_add_f16_sdwa v9, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v4, v2, v10
+; GFX8-NEXT: v_add_f16_sdwa v5, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v6, v3, v11
+; GFX8-NEXT: v_add_f16_sdwa v7, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v0, v12
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
+; GFX8-NEXT: v_mov_b32_e32 v2, v13
+; GFX8-NEXT: v_mov_b32_e32 v3, v9
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_v16bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v11
+; GFX9-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_add_f16 v0, v0, v8
+; GFX9-NEXT: v_pk_add_f16 v8, v1, v9
+; GFX9-NEXT: v_pk_add_f16 v4, v2, v10
+; GFX9-NEXT: v_pk_add_f16 v6, v3, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX9-NEXT: v_mov_b32_e32 v2, v8
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_v16bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v11
+; GFX10-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_pk_add_f16 v0, v0, v8
+; GFX10-NEXT: v_pk_add_f16 v8, v1, v9
+; GFX10-NEXT: v_pk_add_f16 v4, v2, v10
+; GFX10-NEXT: v_pk_add_f16 v6, v3, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX10-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fadd <16 x bfloat> %a, %b
+ ret <16 x bfloat> %op
+}
+
+define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
+; GCN-LABEL: v_fadd_v32bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: v_add_f32_e32 v0, v0, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GCN-NEXT: v_add_f32_e32 v1, v1, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_add_f32_e32 v2, v2, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GCN-NEXT: v_add_f32_e32 v3, v3, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_add_f32_e32 v4, v4, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GCN-NEXT: v_add_f32_e32 v5, v5, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_add_f32_e32 v6, v6, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GCN-NEXT: v_add_f32_e32 v7, v7, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_add_f32_e32 v8, v8, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GCN-NEXT: v_add_f32_e32 v9, v9, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_add_f32_e32 v10, v10, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GCN-NEXT: v_add_f32_e32 v11, v11, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_add_f32_e32 v12, v12, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GCN-NEXT: v_add_f32_e32 v13, v13, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_add_f32_e32 v14, v14, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GCN-NEXT: v_add_f32_e32 v15, v15, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_add_f32_e32 v16, v16, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GCN-NEXT: v_add_f32_e32 v17, v17, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_add_f32_e32 v18, v18, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GCN-NEXT: v_add_f32_e32 v19, v19, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_add_f32_e32 v20, v20, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GCN-NEXT: v_add_f32_e32 v21, v21, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_add_f32_e32 v22, v22, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GCN-NEXT: v_add_f32_e32 v23, v23, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_add_f32_e32 v24, v24, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GCN-NEXT: v_add_f32_e32 v25, v25, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_add_f32_e32 v26, v26, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GCN-NEXT: v_add_f32_e32 v27, v27, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_add_f32_e32 v28, v28, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: v_add_f32_e32 v29, v29, v31
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124
+; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: v_add_f32_e32 v30, v30, v31
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v33
+; GCN-NEXT: v_add_f32_e32 v31, v31, v32
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16
+; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17
+; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18
+; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19
+; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20
+; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21
+; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22
+; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23
+; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24
+; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25
+; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26
+; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27
+; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28
+; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29
+; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30
+; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_v32bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19
+; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v20
+; GFX7-NEXT: v_cvt_f32_f16_e32 v21, v21
+; GFX7-NEXT: v_cvt_f32_f16_e32 v22, v22
+; GFX7-NEXT: v_cvt_f32_f16_e32 v23, v23
+; GFX7-NEXT: v_cvt_f32_f16_e32 v24, v24
+; GFX7-NEXT: v_cvt_f32_f16_e32 v25, v25
+; GFX7-NEXT: v_cvt_f32_f16_e32 v26, v26
+; GFX7-NEXT: v_cvt_f32_f16_e32 v27, v27
+; GFX7-NEXT: v_cvt_f32_f16_e32 v28, v28
+; GFX7-NEXT: v_cvt_f32_f16_e32 v29, v29
+; GFX7-NEXT: v_cvt_f32_f16_e32 v30, v30
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v1, v1, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v4, v4, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v8, v8, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v9, v9, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
+; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v10, v10, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
+; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v11, v11, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52
+; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v12, v12, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v13, v13, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
+; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v14, v14, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
+; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v15, v15, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68
+; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v16, v16, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v17, v17, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
+; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v17
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v18, v18, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
+; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v18
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v19, v19, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84
+; GFX7-NEXT: v_cvt_f16_f32_e32 v19, v19
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v20, v20, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
+; GFX7-NEXT: v_cvt_f16_f32_e32 v20, v20
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v21, v21, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92
+; GFX7-NEXT: v_cvt_f16_f32_e32 v21, v21
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v22, v22, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
+; GFX7-NEXT: v_cvt_f16_f32_e32 v22, v22
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v23, v23, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100
+; GFX7-NEXT: v_cvt_f16_f32_e32 v23, v23
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v24, v24, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
+; GFX7-NEXT: v_cvt_f16_f32_e32 v24, v24
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v25, v25, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108
+; GFX7-NEXT: v_cvt_f16_f32_e32 v25, v25
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v26, v26, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
+; GFX7-NEXT: v_cvt_f16_f32_e32 v26, v26
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v27, v27, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
+; GFX7-NEXT: v_cvt_f16_f32_e32 v27, v27
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v28, v28, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
+; GFX7-NEXT: v_cvt_f16_f32_e32 v28, v28
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v29, v29, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124
+; GFX7-NEXT: v_cvt_f16_f32_e32 v29, v29
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v30, v30, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX7-NEXT: v_cvt_f16_f32_e32 v30, v30
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_add_f32_e32 v31, v31, v32
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_v32bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_f16_e32 v24, v0, v16
+; GFX8-NEXT: v_add_f16_sdwa v16, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v25, v1, v17
+; GFX8-NEXT: v_add_f16_sdwa v17, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v26, v2, v18
+; GFX8-NEXT: v_add_f16_sdwa v18, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v27, v3, v19
+; GFX8-NEXT: v_add_f16_sdwa v19, v3, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v8, v4, v20
+; GFX8-NEXT: v_add_f16_sdwa v9, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v10, v5, v21
+; GFX8-NEXT: v_add_f16_sdwa v11, v5, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v12, v6, v22
+; GFX8-NEXT: v_add_f16_sdwa v13, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v14, v7, v23
+; GFX8-NEXT: v_add_f16_sdwa v15, v7, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v0, v24
+; GFX8-NEXT: v_mov_b32_e32 v1, v16
+; GFX8-NEXT: v_mov_b32_e32 v2, v25
+; GFX8-NEXT: v_mov_b32_e32 v3, v17
+; GFX8-NEXT: v_mov_b32_e32 v4, v26
+; GFX8-NEXT: v_mov_b32_e32 v5, v18
+; GFX8-NEXT: v_mov_b32_e32 v6, v27
+; GFX8-NEXT: v_mov_b32_e32 v7, v19
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_v32bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v19
+; GFX9-NEXT: v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v20
+; GFX9-NEXT: v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v21
+; GFX9-NEXT: v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v22
+; GFX9-NEXT: v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v23
+; GFX9-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v20, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v21, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v22, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v23, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_add_f16 v0, v0, v16
+; GFX9-NEXT: v_pk_add_f16 v16, v1, v17
+; GFX9-NEXT: v_pk_add_f16 v18, v2, v18
+; GFX9-NEXT: v_pk_add_f16 v17, v3, v19
+; GFX9-NEXT: v_pk_add_f16 v8, v4, v20
+; GFX9-NEXT: v_pk_add_f16 v10, v5, v21
+; GFX9-NEXT: v_pk_add_f16 v12, v6, v22
+; GFX9-NEXT: v_pk_add_f16 v14, v7, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX9-NEXT: v_mov_b32_e32 v2, v16
+; GFX9-NEXT: v_mov_b32_e32 v4, v18
+; GFX9-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_v32bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v21
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v22
+; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v23
+; GFX10-NEXT: v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v20, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v21, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v22, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v23, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_pk_add_f16 v0, v0, v16
+; GFX10-NEXT: v_pk_add_f16 v16, v1, v17
+; GFX10-NEXT: v_pk_add_f16 v18, v2, v18
+; GFX10-NEXT: v_pk_add_f16 v17, v3, v19
+; GFX10-NEXT: v_pk_add_f16 v8, v4, v20
+; GFX10-NEXT: v_pk_add_f16 v10, v5, v21
+; GFX10-NEXT: v_pk_add_f16 v12, v6, v22
+; GFX10-NEXT: v_pk_add_f16 v14, v7, v23
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX10-NEXT: v_mov_b32_e32 v2, v16
+; GFX10-NEXT: v_mov_b32_e32 v4, v18
+; GFX10-NEXT: v_mov_b32_e32 v6, v17
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fadd <32 x bfloat> %a, %b
+ ret <32 x bfloat> %op
+}
+
+define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
+; GCN-LABEL: v_fadd_bf16_fpimm_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, 0x3f80
+; GCN-NEXT: v_add_f32_e32 v0, v0, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_bf16_fpimm_0:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, 0x3f80
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_bf16_fpimm_0:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_f16_e32 v0, 0x3f80, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_bf16_fpimm_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_f16_e32 v0, 0x3f80, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_bf16_fpimm_0:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_f16_e32 v0, 0x3f80, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %add = fadd bfloat %arg0, 1.0
+ ret bfloat %add
+}
+
+define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
+; GCN-LABEL: v_fadd_bf16_fpimm_1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, 0x4228
+; GCN-NEXT: v_add_f32_e32 v0, v0, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_bf16_fpimm_1:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, 0x4228
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_bf16_fpimm_1:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_f16_e32 v0, 0x4228, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_bf16_fpimm_1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_f16_e32 v0, 0x4228, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_bf16_fpimm_1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_f16_e32 v0, 0x4228, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %add = fadd bfloat %arg0, 42.0
+ ret bfloat %add
+}
+
+define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fsub_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e64 v1, -v1
+; GCN-NEXT: v_add_f32_e32 v0, v0, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fsub_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e64 v1, -v1
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fsub_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_sub_f16_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fsub_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sub_f16_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fsub_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sub_f16_e32 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fsub bfloat %a, %b
+ ret bfloat %op
+}
+
+define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_fsub_v2bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_or_b32_e32 v2, v3, v2
+; GCN-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_add_f32_e32 v0, v0, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_add_f32_e32 v1, v1, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fsub_v2bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fsub_v2bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
+; GFX8-NEXT: v_add_f16_e32 v2, v0, v1
+; GFX8-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fsub_v2bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fsub_v2bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fsub <2 x bfloat> %a, %b
+ ret <2 x bfloat> %op
+}
+
+define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+; GCN-LABEL: v_fsub_v3bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e64 v3, -v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e64 v4, -v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e64 v5, -v5
+; GCN-NEXT: v_add_f32_e32 v0, v0, v3
+; GCN-NEXT: v_add_f32_e32 v1, v1, v4
+; GCN-NEXT: v_add_f32_e32 v2, v2, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fsub_v3bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e64 v3, -v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX7-NEXT: v_cvt_f32_f16_e64 v3, -v4
+; GFX7-NEXT: v_cvt_f32_f16_e64 v4, -v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fsub_v3bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_sub_f16_e32 v3, v0, v2
+; GFX8-NEXT: v_sub_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fsub_v3bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sub_f16_e32 v3, v0, v2
+; GFX9-NEXT: v_sub_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fsub_v3bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sub_f16_e32 v3, v0, v2
+; GFX10-NEXT: v_sub_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fsub <3 x bfloat> %a, %b
+ ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; GCN-LABEL: v_fsub_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e64 v4, -v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e64 v5, -v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e64 v6, -v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e64 v7, -v7
+; GCN-NEXT: v_add_f32_e32 v0, v0, v4
+; GCN-NEXT: v_add_f32_e32 v1, v1, v5
+; GCN-NEXT: v_add_f32_e32 v2, v2, v6
+; GCN-NEXT: v_add_f32_e32 v3, v3, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fsub_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e64 v4, -v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e64 v5, -v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_cvt_f32_f16_e64 v4, -v6
+; GFX7-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e64 v5, -v7
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fsub_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_sub_f16_e32 v3, v0, v2
+; GFX8-NEXT: v_sub_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fsub_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sub_f16_e32 v3, v0, v2
+; GFX9-NEXT: v_sub_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fsub_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sub_f16_e32 v3, v0, v2
+; GFX10-NEXT: v_sub_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fsub <4 x bfloat> %a, %b
+ ret <4 x bfloat> %op
+}
+
+define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fmul_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmul_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmul_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fmul bfloat %a, %b
+ ret bfloat %op
+}
+
+define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v2bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_mul_f32_e32 v0, v0, v2
+; GCN-NEXT: v_mul_f32_e32 v1, v1, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmul_v2bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v2bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f16_e32 v2, v0, v1
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmul_v2bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v2bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fmul <2 x bfloat> %a, %b
+ ret <2 x bfloat> %op
+}
+
+define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v3bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: v_mul_f32_e32 v0, v0, v3
+; GCN-NEXT: v_mul_f32_e32 v1, v1, v4
+; GCN-NEXT: v_mul_f32_e32 v2, v2, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmul_v3bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v3bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f16_e32 v3, v0, v2
+; GFX8-NEXT: v_mul_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmul_v3bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v0
+; GFX9-NEXT: v_bfi_b32 v1, s4, v2, v2
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v3bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v0, v0
+; GFX10-NEXT: v_bfi_b32 v1, 0xffff, v2, v2
+; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fmul <3 x bfloat> %a, %b
+ ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: v_mul_f32_e32 v0, v0, v4
+; GCN-NEXT: v_mul_f32_e32 v1, v1, v5
+; GCN-NEXT: v_mul_f32_e32 v2, v2, v6
+; GCN-NEXT: v_mul_f32_e32 v3, v3, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmul_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7
+; GFX7-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f16_e32 v3, v0, v2
+; GFX8-NEXT: v_mul_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmul_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT: v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fmul <4 x bfloat> %a, %b
+ ret <4 x bfloat> %op
+}
+
+define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v8bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT: v_mul_f32_e32 v0, v0, v8
+; GCN-NEXT: v_mul_f32_e32 v1, v1, v9
+; GCN-NEXT: v_mul_f32_e32 v2, v2, v10
+; GCN-NEXT: v_mul_f32_e32 v3, v3, v11
+; GCN-NEXT: v_mul_f32_e32 v4, v4, v12
+; GCN-NEXT: v_mul_f32_e32 v5, v5, v13
+; GCN-NEXT: v_mul_f32_e32 v6, v6, v14
+; GCN-NEXT: v_mul_f32_e32 v7, v7, v15
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmul_v8bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v10
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v11
+; GFX7-NEXT: v_mul_f32_e32 v2, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v12
+; GFX7-NEXT: v_mul_f32_e32 v3, v3, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v13
+; GFX7-NEXT: v_mul_f32_e32 v4, v4, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v14
+; GFX7-NEXT: v_mul_f32_e32 v5, v5, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v15
+; GFX7-NEXT: v_mul_f32_e32 v6, v6, v8
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_mul_f32_e32 v7, v7, v9
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v8bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f16_e32 v6, v0, v4
+; GFX8-NEXT: v_mul_f16_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v2, v1, v5
+; GFX8-NEXT: v_mul_f16_sdwa v3, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v0, v6
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmul_v8bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX9-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4
+; GFX9-NEXT: v_pk_mul_f16 v2, v1, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v8bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX10-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_pk_mul_f16 v0, v0, v4
+; GFX10-NEXT: v_pk_mul_f16 v2, v1, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fmul <8 x bfloat> %a, %b
+ ret <8 x bfloat> %op
+}
+
+define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v16bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT: v_mul_f32_e32 v0, v0, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v17
+; GCN-NEXT: v_mul_f32_e32 v1, v1, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v18
+; GCN-NEXT: v_mul_f32_e32 v2, v2, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v19
+; GCN-NEXT: v_mul_f32_e32 v3, v3, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v20
+; GCN-NEXT: v_mul_f32_e32 v4, v4, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v21
+; GCN-NEXT: v_mul_f32_e32 v5, v5, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v22
+; GCN-NEXT: v_mul_f32_e32 v6, v6, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v23
+; GCN-NEXT: v_mul_f32_e32 v7, v7, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v24
+; GCN-NEXT: v_mul_f32_e32 v8, v8, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v25
+; GCN-NEXT: v_mul_f32_e32 v9, v9, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v26
+; GCN-NEXT: v_mul_f32_e32 v10, v10, v16
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT: v_cvt_f32_f16_e32 v17, v27
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT: v_cvt_f32_f16_e32 v18, v28
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v29
+; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v20, v30
+; GCN-NEXT: v_mul_f32_e32 v11, v11, v17
+; GCN-NEXT: v_mul_f32_e32 v12, v12, v18
+; GCN-NEXT: v_mul_f32_e32 v13, v13, v19
+; GCN-NEXT: v_mul_f32_e32 v14, v14, v20
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT: v_mul_f32_e32 v15, v15, v16
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmul_v16bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v20
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v21
+; GFX7-NEXT: v_mul_f32_e32 v4, v4, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT: v_mul_f32_e32 v2, v2, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v19
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v22
+; GFX7-NEXT: v_mul_f32_e32 v5, v5, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT: v_mul_f32_e32 v3, v3, v16
+; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v23
+; GFX7-NEXT: v_mul_f32_e32 v6, v6, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v24
+; GFX7-NEXT: v_mul_f32_e32 v7, v7, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v25
+; GFX7-NEXT: v_mul_f32_e32 v8, v8, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v26
+; GFX7-NEXT: v_mul_f32_e32 v9, v9, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v27
+; GFX7-NEXT: v_mul_f32_e32 v10, v10, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v28
+; GFX7-NEXT: v_mul_f32_e32 v11, v11, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v29
+; GFX7-NEXT: v_mul_f32_e32 v12, v12, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v30
+; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GFX7-NEXT: v_mul_f32_e32 v13, v13, v18
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_mul_f32_e32 v14, v14, v17
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT: v_mul_f32_e32 v15, v15, v16
+; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v16bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f16_e32 v12, v0, v8
+; GFX8-NEXT: v_mul_f16_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v13, v1, v9
+; GFX8-NEXT: v_mul_f16_sdwa v9, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v4, v2, v10
+; GFX8-NEXT: v_mul_f16_sdwa v5, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v6, v3, v11
+; GFX8-NEXT: v_mul_f16_sdwa v7, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v0, v12
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
+; GFX8-NEXT: v_mov_b32_e32 v2, v13
+; GFX8-NEXT: v_mov_b32_e32 v3, v9
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmul_v16bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v11
+; GFX9-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v8
+; GFX9-NEXT: v_pk_mul_f16 v8, v1, v9
+; GFX9-NEXT: v_pk_mul_f16 v4, v2, v10
+; GFX9-NEXT: v_pk_mul_f16 v6, v3, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX9-NEXT: v_mov_b32_e32 v2, v8
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v16bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v11
+; GFX10-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_pk_mul_f16 v0, v0, v8
+; GFX10-NEXT: v_pk_mul_f16 v8, v1, v9
+; GFX10-NEXT: v_pk_mul_f16 v4, v2, v10
+; GFX10-NEXT: v_pk_mul_f16 v6, v3, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX10-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fmul <16 x bfloat> %a, %b
+ ret <16 x bfloat> %op
+}
+
+define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v32bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: v_mul_f32_e32 v0, v0, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GCN-NEXT: v_mul_f32_e32 v1, v1, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_mul_f32_e32 v2, v2, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GCN-NEXT: v_mul_f32_e32 v3, v3, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_mul_f32_e32 v4, v4, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GCN-NEXT: v_mul_f32_e32 v5, v5, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_mul_f32_e32 v6, v6, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GCN-NEXT: v_mul_f32_e32 v7, v7, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_mul_f32_e32 v8, v8, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GCN-NEXT: v_mul_f32_e32 v9, v9, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_mul_f32_e32 v10, v10, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GCN-NEXT: v_mul_f32_e32 v11, v11, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_mul_f32_e32 v12, v12, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GCN-NEXT: v_mul_f32_e32 v13, v13, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_mul_f32_e32 v14, v14, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GCN-NEXT: v_mul_f32_e32 v15, v15, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_mul_f32_e32 v16, v16, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GCN-NEXT: v_mul_f32_e32 v17, v17, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_mul_f32_e32 v18, v18, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GCN-NEXT: v_mul_f32_e32 v19, v19, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_mul_f32_e32 v20, v20, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GCN-NEXT: v_mul_f32_e32 v21, v21, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_mul_f32_e32 v22, v22, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GCN-NEXT: v_mul_f32_e32 v23, v23, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_mul_f32_e32 v24, v24, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GCN-NEXT: v_mul_f32_e32 v25, v25, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_mul_f32_e32 v26, v26, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GCN-NEXT: v_mul_f32_e32 v27, v27, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_mul_f32_e32 v28, v28, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: v_mul_f32_e32 v29, v29, v31
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124
+; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: v_mul_f32_e32 v30, v30, v31
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v33
+; GCN-NEXT: v_mul_f32_e32 v31, v31, v32
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16
+; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17
+; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18
+; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19
+; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20
+; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21
+; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22
+; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23
+; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24
+; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25
+; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26
+; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27
+; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28
+; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29
+; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30
+; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmul_v32bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19
+; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v20
+; GFX7-NEXT: v_cvt_f32_f16_e32 v21, v21
+; GFX7-NEXT: v_cvt_f32_f16_e32 v22, v22
+; GFX7-NEXT: v_cvt_f32_f16_e32 v23, v23
+; GFX7-NEXT: v_cvt_f32_f16_e32 v24, v24
+; GFX7-NEXT: v_cvt_f32_f16_e32 v25, v25
+; GFX7-NEXT: v_cvt_f32_f16_e32 v26, v26
+; GFX7-NEXT: v_cvt_f32_f16_e32 v27, v27
+; GFX7-NEXT: v_cvt_f32_f16_e32 v28, v28
+; GFX7-NEXT: v_cvt_f32_f16_e32 v29, v29
+; GFX7-NEXT: v_cvt_f32_f16_e32 v30, v30
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v2, v2, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v3, v3, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v4, v4, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v5, v5, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v6, v6, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v7, v7, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v8, v8, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v9, v9, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
+; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v10, v10, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
+; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v11, v11, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52
+; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v12, v12, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v13, v13, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
+; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v14, v14, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
+; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v15, v15, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68
+; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v16, v16, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v17, v17, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
+; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v17
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v18, v18, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
+; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v18
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v19, v19, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84
+; GFX7-NEXT: v_cvt_f16_f32_e32 v19, v19
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v20, v20, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
+; GFX7-NEXT: v_cvt_f16_f32_e32 v20, v20
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v21, v21, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92
+; GFX7-NEXT: v_cvt_f16_f32_e32 v21, v21
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v22, v22, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
+; GFX7-NEXT: v_cvt_f16_f32_e32 v22, v22
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v23, v23, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100
+; GFX7-NEXT: v_cvt_f16_f32_e32 v23, v23
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v24, v24, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
+; GFX7-NEXT: v_cvt_f16_f32_e32 v24, v24
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v25, v25, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108
+; GFX7-NEXT: v_cvt_f16_f32_e32 v25, v25
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v26, v26, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
+; GFX7-NEXT: v_cvt_f16_f32_e32 v26, v26
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v27, v27, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
+; GFX7-NEXT: v_cvt_f16_f32_e32 v27, v27
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v28, v28, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
+; GFX7-NEXT: v_cvt_f16_f32_e32 v28, v28
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v29, v29, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124
+; GFX7-NEXT: v_cvt_f16_f32_e32 v29, v29
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v30, v30, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX7-NEXT: v_cvt_f16_f32_e32 v30, v30
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_mul_f32_e32 v31, v31, v32
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v32bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f16_e32 v24, v0, v16
+; GFX8-NEXT: v_mul_f16_sdwa v16, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v25, v1, v17
+; GFX8-NEXT: v_mul_f16_sdwa v17, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v26, v2, v18
+; GFX8-NEXT: v_mul_f16_sdwa v18, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v27, v3, v19
+; GFX8-NEXT: v_mul_f16_sdwa v19, v3, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v8, v4, v20
+; GFX8-NEXT: v_mul_f16_sdwa v9, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v10, v5, v21
+; GFX8-NEXT: v_mul_f16_sdwa v11, v5, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v12, v6, v22
+; GFX8-NEXT: v_mul_f16_sdwa v13, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v14, v7, v23
+; GFX8-NEXT: v_mul_f16_sdwa v15, v7, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v0, v24
+; GFX8-NEXT: v_mov_b32_e32 v1, v16
+; GFX8-NEXT: v_mov_b32_e32 v2, v25
+; GFX8-NEXT: v_mov_b32_e32 v3, v17
+; GFX8-NEXT: v_mov_b32_e32 v4, v26
+; GFX8-NEXT: v_mov_b32_e32 v5, v18
+; GFX8-NEXT: v_mov_b32_e32 v6, v27
+; GFX8-NEXT: v_mov_b32_e32 v7, v19
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmul_v32bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v19
+; GFX9-NEXT: v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v20
+; GFX9-NEXT: v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v21
+; GFX9-NEXT: v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v22
+; GFX9-NEXT: v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v23
+; GFX9-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v20, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v21, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v22, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v23, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v16
+; GFX9-NEXT: v_pk_mul_f16 v16, v1, v17
+; GFX9-NEXT: v_pk_mul_f16 v18, v2, v18
+; GFX9-NEXT: v_pk_mul_f16 v17, v3, v19
+; GFX9-NEXT: v_pk_mul_f16 v8, v4, v20
+; GFX9-NEXT: v_pk_mul_f16 v10, v5, v21
+; GFX9-NEXT: v_pk_mul_f16 v12, v6, v22
+; GFX9-NEXT: v_pk_mul_f16 v14, v7, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX9-NEXT: v_mov_b32_e32 v2, v16
+; GFX9-NEXT: v_mov_b32_e32 v4, v18
+; GFX9-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v32bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v21
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v22
+; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v23
+; GFX10-NEXT: v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v20, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v21, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v22, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v23, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_pk_mul_f16 v0, v0, v16
+; GFX10-NEXT: v_pk_mul_f16 v16, v1, v17
+; GFX10-NEXT: v_pk_mul_f16 v18, v2, v18
+; GFX10-NEXT: v_pk_mul_f16 v17, v3, v19
+; GFX10-NEXT: v_pk_mul_f16 v8, v4, v20
+; GFX10-NEXT: v_pk_mul_f16 v10, v5, v21
+; GFX10-NEXT: v_pk_mul_f16 v12, v6, v22
+; GFX10-NEXT: v_pk_mul_f16 v14, v7, v23
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX10-NEXT: v_mov_b32_e32 v2, v16
+; GFX10-NEXT: v_mov_b32_e32 v4, v18
+; GFX10-NEXT: v_mov_b32_e32 v6, v17
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fmul <32 x bfloat> %a, %b
+ ret <32 x bfloat> %op
+}
+
+define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fdiv_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GCN-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
+; GCN-NEXT: v_rcp_f32_e32 v4, v2
+; GCN-NEXT: v_fma_f32 v5, -v2, v4, 1.0
+; GCN-NEXT: v_fma_f32 v4, v5, v4, v4
+; GCN-NEXT: v_mul_f32_e32 v5, v3, v4
+; GCN-NEXT: v_fma_f32 v6, -v2, v5, v3
+; GCN-NEXT: v_fma_f32 v5, v6, v4, v5
+; GCN-NEXT: v_fma_f32 v2, -v2, v5, v3
+; GCN-NEXT: v_div_fmas_f32 v2, v2, v4, v5
+; GCN-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fdiv_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX7-NEXT: v_rcp_f32_e32 v3, v2
+; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fdiv_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX8-NEXT: v_rcp_f32_e32 v2, v2
+; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2
+; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fdiv_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX9-NEXT: v_rcp_f32_e32 v2, v2
+; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2
+; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fdiv_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX10-NEXT: v_rcp_f32_e32 v2, v2
+; GFX10-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fdiv bfloat %a, %b
+ ret bfloat %op
+}
+
+declare bfloat @llvm.fabs.bf16(bfloat)
+
+define bfloat @v_fabs_bf16(bfloat %a) {
+; GCN-LABEL: v_fabs_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fabs_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fabs_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fabs_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fabs_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call bfloat @llvm.fabs.bf16(bfloat %a)
+ ret bfloat %op
+}
+
+define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) {
+; GCN-LABEL: s_fabs_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_and_b32 s0, s0, 0x7fff
+; GCN-NEXT: s_and_b32 s0, 0xffff, s0
+; GCN-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: s_fabs_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX7-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX7-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_fabs_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fabs_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_fabs_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX10-NEXT: ; return to shader part epilog
+ %op = call bfloat @llvm.fabs.bf16(bfloat %a)
+ %cast = bitcast bfloat %op to i16
+ %zext = zext i16 %cast to i32
+ %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+ ret i32 %readlane
+}
+
+define bfloat @v_fneg_bf16(bfloat %a) {
+; GCN-LABEL: v_fneg_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fneg_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fneg_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fneg_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fneg_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fneg bfloat %a
+ ret bfloat %op
+}
+
+declare i32 @llvm.amdgcn.readfirstlane(i32)
+
+; FIXME: readfirstlane hack for other bugs
+define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) {
+; GCN-LABEL: s_fneg_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_xor_b32 s0, s0, 0x8000
+; GCN-NEXT: s_and_b32 s0, 0xffff, s0
+; GCN-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: s_fneg_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_xor_b32 s0, s0, 0x8000
+; GFX7-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX7-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_fneg_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_xor_b32 s0, s0, 0x8000
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fneg_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_xor_b32 s0, s0, 0x8000
+; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_fneg_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_xor_b32 s0, s0, 0x8000
+; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX10-NEXT: ; return to shader part epilog
+ %op = fneg bfloat %a
+ %cast = bitcast bfloat %op to i16
+ %zext = zext i16 %cast to i32
+ %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+ ret i32 %readlane
+}
+
+define bfloat @v_fneg_fabs_bf16(bfloat %a) {
+; GCN-LABEL: v_fneg_fabs_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v0, 0x8000, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fneg_fabs_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_or_b32_e32 v0, 0x8000, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fneg_fabs_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v0, 0x8000, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fneg_fabs_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_or_b32_e32 v0, 0x8000, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fneg_fabs_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_or_b32_e32 v0, 0x8000, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
+ %op = fneg bfloat %fabs
+ ret bfloat %op
+}
+
+; FIXME: readfirstlane hack for other bugs
+define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
+; GCN-LABEL: s_fneg_fabs_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_bitset1_b32 s0, 15
+; GCN-NEXT: s_and_b32 s0, 0xffff, s0
+; GCN-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: s_fneg_fabs_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_bitset1_b32 s0, 15
+; GFX7-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX7-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_fneg_fabs_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_bitset1_b32 s0, 15
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fneg_fabs_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_bitset1_b32 s0, 15
+; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_fneg_fabs_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_bitset1_b32 s0, 15
+; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX10-NEXT: ; return to shader part epilog
+ %fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
+ %op = fneg bfloat %fabs
+ %cast = bitcast bfloat %op to i16
+ %zext = zext i16 %cast to i32
+ %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+ ret i32 %readlane
+}
+
+declare bfloat @llvm.minnum.bf16(bfloat, bfloat)
+declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
+declare <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat>, <3 x bfloat>)
+declare <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
+declare <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat>, <8 x bfloat>)
+declare <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat>, <16 x bfloat>)
+declare <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat>, <32 x bfloat>)
+
+define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_minnum_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX8-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX9-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX10-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX10-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call bfloat @llvm.minnum.bf16(bfloat %a, bfloat %b)
+ ret bfloat %op
+}
+
+define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v2bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_min_f32_e32 v0, v0, v2
+; GCN-NEXT: v_min_f32_e32 v1, v1, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_v2bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_v2bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f16_e32 v2, v0, v0
+; GFX8-NEXT: v_max_f16_e32 v3, v1, v1
+; GFX8-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_e32 v2, v2, v3
+; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_v2bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT: v_pk_min_f16 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_v2bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT: v_pk_min_f16 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
+ ret <2 x bfloat> %op
+}
+
+define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v3bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: v_min_f32_e32 v0, v0, v3
+; GCN-NEXT: v_min_f32_e32 v1, v1, v4
+; GCN-NEXT: v_min_f32_e32 v2, v2, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_v3bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_v3bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f16_e32 v1, v0, v0
+; GFX8-NEXT: v_max_f16_e32 v3, v2, v2
+; GFX8-NEXT: v_min_f16_e32 v3, v1, v3
+; GFX8-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_e32 v1, v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_v3bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v0
+; GFX9-NEXT: v_bfi_b32 v1, s4, v2, v2
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT: v_pk_min_f16 v0, v0, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_v3bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v0, v0
+; GFX10-NEXT: v_bfi_b32 v1, 0xffff, v2, v2
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT: v_pk_min_f16 v0, v0, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
+ ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: v_min_f32_e32 v0, v0, v4
+; GCN-NEXT: v_min_f32_e32 v1, v1, v5
+; GCN-NEXT: v_min_f32_e32 v2, v2, v6
+; GCN-NEXT: v_min_f32_e32 v3, v3, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f16_e32 v1, v0, v0
+; GFX8-NEXT: v_max_f16_e32 v3, v2, v2
+; GFX8-NEXT: v_min_f16_e32 v3, v1, v3
+; GFX8-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_e32 v1, v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT: v_pk_max_f16 v1, v2, v2
+; GFX9-NEXT: v_pk_min_f16 v0, v0, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT: v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT: v_pk_max_f16 v1, v2, v2
+; GFX10-NEXT: v_pk_min_f16 v0, v0, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
+ ret <4 x bfloat> %op
+}
+
+define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v8bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT: v_min_f32_e32 v0, v0, v8
+; GCN-NEXT: v_min_f32_e32 v1, v1, v9
+; GCN-NEXT: v_min_f32_e32 v2, v2, v10
+; GCN-NEXT: v_min_f32_e32 v3, v3, v11
+; GCN-NEXT: v_min_f32_e32 v4, v4, v12
+; GCN-NEXT: v_min_f32_e32 v5, v5, v13
+; GCN-NEXT: v_min_f32_e32 v6, v6, v14
+; GCN-NEXT: v_min_f32_e32 v7, v7, v15
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_v8bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v10
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v11
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v12
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v13
+; GFX7-NEXT: v_min_f32_e32 v4, v4, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v14
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v15
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v8
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_min_f32_e32 v7, v7, v9
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_v8bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f16_e32 v2, v0, v0
+; GFX8-NEXT: v_max_f16_e32 v3, v4, v4
+; GFX8-NEXT: v_min_f16_e32 v6, v2, v3
+; GFX8-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_e32 v4, v0, v2
+; GFX8-NEXT: v_max_f16_e32 v0, v1, v1
+; GFX8-NEXT: v_max_f16_e32 v2, v5, v5
+; GFX8-NEXT: v_min_f16_e32 v2, v0, v2
+; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_e32 v3, v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, v6
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_v8bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX9-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT: v_pk_max_f16 v2, v4, v4
+; GFX9-NEXT: v_pk_min_f16 v0, v0, v2
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT: v_pk_max_f16 v2, v5, v5
+; GFX9-NEXT: v_pk_min_f16 v2, v1, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_v8bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX10-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT: v_pk_max_f16 v2, v4, v4
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT: v_pk_max_f16 v3, v5, v5
+; GFX10-NEXT: v_pk_min_f16 v0, v0, v2
+; GFX10-NEXT: v_pk_min_f16 v2, v1, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
+ ret <8 x bfloat> %op
+}
+
+define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v16bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT: v_min_f32_e32 v0, v0, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v17
+; GCN-NEXT: v_min_f32_e32 v1, v1, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v18
+; GCN-NEXT: v_min_f32_e32 v2, v2, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v19
+; GCN-NEXT: v_min_f32_e32 v3, v3, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v20
+; GCN-NEXT: v_min_f32_e32 v4, v4, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v21
+; GCN-NEXT: v_min_f32_e32 v5, v5, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v22
+; GCN-NEXT: v_min_f32_e32 v6, v6, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v23
+; GCN-NEXT: v_min_f32_e32 v7, v7, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v24
+; GCN-NEXT: v_min_f32_e32 v8, v8, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v25
+; GCN-NEXT: v_min_f32_e32 v9, v9, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v26
+; GCN-NEXT: v_min_f32_e32 v10, v10, v16
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT: v_cvt_f32_f16_e32 v17, v27
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT: v_cvt_f32_f16_e32 v18, v28
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v29
+; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v20, v30
+; GCN-NEXT: v_min_f32_e32 v11, v11, v17
+; GCN-NEXT: v_min_f32_e32 v12, v12, v18
+; GCN-NEXT: v_min_f32_e32 v13, v13, v19
+; GCN-NEXT: v_min_f32_e32 v14, v14, v20
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT: v_min_f32_e32 v15, v15, v16
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_v16bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v20
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v21
+; GFX7-NEXT: v_min_f32_e32 v4, v4, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v19
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v22
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v16
+; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v23
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v24
+; GFX7-NEXT: v_min_f32_e32 v7, v7, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v25
+; GFX7-NEXT: v_min_f32_e32 v8, v8, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v26
+; GFX7-NEXT: v_min_f32_e32 v9, v9, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v27
+; GFX7-NEXT: v_min_f32_e32 v10, v10, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v28
+; GFX7-NEXT: v_min_f32_e32 v11, v11, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v29
+; GFX7-NEXT: v_min_f32_e32 v12, v12, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v30
+; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GFX7-NEXT: v_min_f32_e32 v13, v13, v18
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_min_f32_e32 v14, v14, v17
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT: v_min_f32_e32 v15, v15, v16
+; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_v16bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f16_e32 v4, v0, v0
+; GFX8-NEXT: v_max_f16_e32 v5, v8, v8
+; GFX8-NEXT: v_min_f16_e32 v12, v4, v5
+; GFX8-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v4, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_e32 v8, v0, v4
+; GFX8-NEXT: v_max_f16_e32 v0, v1, v1
+; GFX8-NEXT: v_max_f16_e32 v4, v9, v9
+; GFX8-NEXT: v_min_f16_e32 v13, v0, v4
+; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_e32 v9, v0, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v2, v2
+; GFX8-NEXT: v_max_f16_e32 v1, v10, v10
+; GFX8-NEXT: v_min_f16_e32 v4, v0, v1
+; GFX8-NEXT: v_max_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_e32 v5, v0, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v3, v3
+; GFX8-NEXT: v_max_f16_e32 v1, v11, v11
+; GFX8-NEXT: v_min_f16_e32 v6, v0, v1
+; GFX8-NEXT: v_max_f16_sdwa v0, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_e32 v7, v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, v12
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
+; GFX8-NEXT: v_mov_b32_e32 v2, v13
+; GFX8-NEXT: v_mov_b32_e32 v3, v9
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_v16bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v9
+; GFX9-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v10
+; GFX9-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT: v_pk_max_f16 v4, v8, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v11
+; GFX9-NEXT: v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_min_f16 v0, v0, v4
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT: v_pk_max_f16 v4, v9, v9
+; GFX9-NEXT: v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_min_f16 v8, v1, v4
+; GFX9-NEXT: v_pk_max_f16 v1, v2, v2
+; GFX9-NEXT: v_pk_max_f16 v2, v10, v10
+; GFX9-NEXT: v_pk_min_f16 v4, v1, v2
+; GFX9-NEXT: v_pk_max_f16 v1, v3, v3
+; GFX9-NEXT: v_pk_max_f16 v2, v11, v11
+; GFX9-NEXT: v_pk_min_f16 v6, v1, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX9-NEXT: v_mov_b32_e32 v2, v8
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_v16bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v11
+; GFX10-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT: v_pk_max_f16 v4, v8, v8
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT: v_pk_max_f16 v5, v9, v9
+; GFX10-NEXT: v_pk_max_f16 v6, v2, v2
+; GFX10-NEXT: v_pk_max_f16 v7, v10, v10
+; GFX10-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX10-NEXT: v_pk_max_f16 v8, v11, v11
+; GFX10-NEXT: v_pk_min_f16 v0, v0, v4
+; GFX10-NEXT: v_pk_min_f16 v2, v1, v5
+; GFX10-NEXT: v_pk_min_f16 v4, v6, v7
+; GFX10-NEXT: v_pk_min_f16 v6, v3, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
+ ret <16 x bfloat> %op
+}
+
+define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v32bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: v_min_f32_e32 v0, v0, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GCN-NEXT: v_min_f32_e32 v1, v1, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_min_f32_e32 v2, v2, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GCN-NEXT: v_min_f32_e32 v3, v3, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_min_f32_e32 v4, v4, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GCN-NEXT: v_min_f32_e32 v5, v5, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_min_f32_e32 v6, v6, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GCN-NEXT: v_min_f32_e32 v7, v7, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_min_f32_e32 v8, v8, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GCN-NEXT: v_min_f32_e32 v9, v9, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_min_f32_e32 v10, v10, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GCN-NEXT: v_min_f32_e32 v11, v11, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_min_f32_e32 v12, v12, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GCN-NEXT: v_min_f32_e32 v13, v13, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_min_f32_e32 v14, v14, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GCN-NEXT: v_min_f32_e32 v15, v15, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_min_f32_e32 v16, v16, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GCN-NEXT: v_min_f32_e32 v17, v17, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_min_f32_e32 v18, v18, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GCN-NEXT: v_min_f32_e32 v19, v19, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_min_f32_e32 v20, v20, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GCN-NEXT: v_min_f32_e32 v21, v21, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_min_f32_e32 v22, v22, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GCN-NEXT: v_min_f32_e32 v23, v23, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_min_f32_e32 v24, v24, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GCN-NEXT: v_min_f32_e32 v25, v25, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_min_f32_e32 v26, v26, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GCN-NEXT: v_min_f32_e32 v27, v27, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_min_f32_e32 v28, v28, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: v_min_f32_e32 v29, v29, v31
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124
+; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: v_min_f32_e32 v30, v30, v31
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v33
+; GCN-NEXT: v_min_f32_e32 v31, v31, v32
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16
+; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17
+; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18
+; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19
+; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20
+; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21
+; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22
+; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23
+; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24
+; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25
+; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26
+; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27
+; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28
+; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29
+; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30
+; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_v32bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19
+; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v20
+; GFX7-NEXT: v_cvt_f32_f16_e32 v21, v21
+; GFX7-NEXT: v_cvt_f32_f16_e32 v22, v22
+; GFX7-NEXT: v_cvt_f32_f16_e32 v23, v23
+; GFX7-NEXT: v_cvt_f32_f16_e32 v24, v24
+; GFX7-NEXT: v_cvt_f32_f16_e32 v25, v25
+; GFX7-NEXT: v_cvt_f32_f16_e32 v26, v26
+; GFX7-NEXT: v_cvt_f32_f16_e32 v27, v27
+; GFX7-NEXT: v_cvt_f32_f16_e32 v28, v28
+; GFX7-NEXT: v_cvt_f32_f16_e32 v29, v29
+; GFX7-NEXT: v_cvt_f32_f16_e32 v30, v30
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v4, v4, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v7, v7, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v8, v8, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v9, v9, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
+; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v10, v10, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
+; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v11, v11, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52
+; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v12, v12, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v13, v13, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
+; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v14, v14, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
+; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v15, v15, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68
+; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v16, v16, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v17, v17, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
+; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v17
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v18, v18, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
+; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v18
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v19, v19, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84
+; GFX7-NEXT: v_cvt_f16_f32_e32 v19, v19
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v20, v20, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
+; GFX7-NEXT: v_cvt_f16_f32_e32 v20, v20
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v21, v21, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92
+; GFX7-NEXT: v_cvt_f16_f32_e32 v21, v21
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v22, v22, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
+; GFX7-NEXT: v_cvt_f16_f32_e32 v22, v22
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v23, v23, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100
+; GFX7-NEXT: v_cvt_f16_f32_e32 v23, v23
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v24, v24, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
+; GFX7-NEXT: v_cvt_f16_f32_e32 v24, v24
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v25, v25, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108
+; GFX7-NEXT: v_cvt_f16_f32_e32 v25, v25
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v26, v26, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
+; GFX7-NEXT: v_cvt_f16_f32_e32 v26, v26
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v27, v27, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
+; GFX7-NEXT: v_cvt_f16_f32_e32 v27, v27
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v28, v28, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
+; GFX7-NEXT: v_cvt_f16_f32_e32 v28, v28
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v29, v29, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124
+; GFX7-NEXT: v_cvt_f16_f32_e32 v29, v29
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v30, v30, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX7-NEXT: v_cvt_f16_f32_e32 v30, v30
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v31, v31, v32
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_v32bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f16_e32 v8, v0, v0
+; GFX8-NEXT: v_max_f16_e32 v9, v16, v16
+; GFX8-NEXT: v_min_f16_e32 v24, v8, v9
+; GFX8-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v8, v16, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_e32 v16, v0, v8
+; GFX8-NEXT: v_max_f16_e32 v0, v1, v1
+; GFX8-NEXT: v_max_f16_e32 v8, v17, v17
+; GFX8-NEXT: v_min_f16_e32 v25, v0, v8
+; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v17, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_e32 v17, v0, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v2, v2
+; GFX8-NEXT: v_max_f16_e32 v1, v18, v18
+; GFX8-NEXT: v_min_f16_e32 v26, v0, v1
+; GFX8-NEXT: v_max_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v18, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_e32 v18, v0, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v3, v3
+; GFX8-NEXT: v_max_f16_e32 v1, v19, v19
+; GFX8-NEXT: v_min_f16_e32 v27, v0, v1
+; GFX8-NEXT: v_max_f16_sdwa v0, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v19, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_e32 v19, v0, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v4, v4
+; GFX8-NEXT: v_max_f16_e32 v1, v20, v20
+; GFX8-NEXT: v_min_f16_e32 v8, v0, v1
+; GFX8-NEXT: v_max_f16_sdwa v0, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v20, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_e32 v9, v0, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v5, v5
+; GFX8-NEXT: v_max_f16_e32 v1, v21, v21
+; GFX8-NEXT: v_min_f16_e32 v10, v0, v1
+; GFX8-NEXT: v_max_f16_sdwa v0, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v21, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_e32 v11, v0, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v6, v6
+; GFX8-NEXT: v_max_f16_e32 v1, v22, v22
+; GFX8-NEXT: v_min_f16_e32 v12, v0, v1
+; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v22, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_e32 v13, v0, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v7, v7
+; GFX8-NEXT: v_max_f16_e32 v1, v23, v23
+; GFX8-NEXT: v_min_f16_e32 v14, v0, v1
+; GFX8-NEXT: v_max_f16_sdwa v0, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v23, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_e32 v15, v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, v24
+; GFX8-NEXT: v_mov_b32_e32 v1, v16
+; GFX8-NEXT: v_mov_b32_e32 v2, v25
+; GFX8-NEXT: v_mov_b32_e32 v3, v17
+; GFX8-NEXT: v_mov_b32_e32 v4, v26
+; GFX8-NEXT: v_mov_b32_e32 v5, v18
+; GFX8-NEXT: v_mov_b32_e32 v6, v27
+; GFX8-NEXT: v_mov_b32_e32 v7, v19
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_v32bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v17
+; GFX9-NEXT: v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v20
+; GFX9-NEXT: v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18
+; GFX9-NEXT: v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v20, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT: v_pk_max_f16 v8, v16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v19
+; GFX9-NEXT: v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_min_f16 v0, v0, v8
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT: v_pk_max_f16 v8, v17, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX9-NEXT: v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_min_f16 v16, v1, v8
+; GFX9-NEXT: v_pk_max_f16 v1, v2, v2
+; GFX9-NEXT: v_pk_max_f16 v2, v18, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v21
+; GFX9-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_min_f16 v18, v1, v2
+; GFX9-NEXT: v_pk_max_f16 v1, v3, v3
+; GFX9-NEXT: v_pk_max_f16 v2, v19, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v22
+; GFX9-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v21, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_min_f16 v17, v1, v2
+; GFX9-NEXT: v_pk_max_f16 v1, v4, v4
+; GFX9-NEXT: v_pk_max_f16 v2, v20, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v23
+; GFX9-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v22, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_min_f16 v8, v1, v2
+; GFX9-NEXT: v_pk_max_f16 v1, v5, v5
+; GFX9-NEXT: v_pk_max_f16 v2, v21, v21
+; GFX9-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v23, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_min_f16 v10, v1, v2
+; GFX9-NEXT: v_pk_max_f16 v1, v6, v6
+; GFX9-NEXT: v_pk_max_f16 v2, v22, v22
+; GFX9-NEXT: v_pk_min_f16 v12, v1, v2
+; GFX9-NEXT: v_pk_max_f16 v1, v7, v7
+; GFX9-NEXT: v_pk_max_f16 v2, v23, v23
+; GFX9-NEXT: v_pk_min_f16 v14, v1, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX9-NEXT: v_mov_b32_e32 v2, v16
+; GFX9-NEXT: v_mov_b32_e32 v4, v18
+; GFX9-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_v32bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v21
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v22
+; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v23
+; GFX10-NEXT: v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v20, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v21, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT: v_pk_max_f16 v8, v16, v16
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT: v_pk_max_f16 v9, v17, v17
+; GFX10-NEXT: v_pk_max_f16 v10, v2, v2
+; GFX10-NEXT: v_pk_max_f16 v11, v18, v18
+; GFX10-NEXT: v_mov_b32_sdwa v22, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v23, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_pk_min_f16 v0, v0, v8
+; GFX10-NEXT: v_pk_min_f16 v2, v1, v9
+; GFX10-NEXT: v_pk_min_f16 v16, v10, v11
+; GFX10-NEXT: v_pk_max_f16 v1, v3, v3
+; GFX10-NEXT: v_pk_max_f16 v3, v19, v19
+; GFX10-NEXT: v_pk_max_f16 v4, v4, v4
+; GFX10-NEXT: v_pk_max_f16 v8, v20, v20
+; GFX10-NEXT: v_pk_max_f16 v5, v5, v5
+; GFX10-NEXT: v_pk_max_f16 v9, v21, v21
+; GFX10-NEXT: v_pk_max_f16 v11, v6, v6
+; GFX10-NEXT: v_pk_max_f16 v12, v22, v22
+; GFX10-NEXT: v_pk_max_f16 v7, v7, v7
+; GFX10-NEXT: v_pk_max_f16 v13, v23, v23
+; GFX10-NEXT: v_pk_min_f16 v6, v1, v3
+; GFX10-NEXT: v_pk_min_f16 v8, v4, v8
+; GFX10-NEXT: v_pk_min_f16 v10, v5, v9
+; GFX10-NEXT: v_pk_min_f16 v12, v11, v12
+; GFX10-NEXT: v_pk_min_f16 v14, v7, v13
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX10-NEXT: v_mov_b32_e32 v4, v16
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
+ ret <32 x bfloat> %op
+}
+
+
+declare bfloat @llvm.maxnum.bf16(bfloat, bfloat)
+declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
+declare <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat>, <3 x bfloat>)
+declare <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
+declare <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat>, <8 x bfloat>)
+declare <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat>, <16 x bfloat>)
+declare <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat>, <32 x bfloat>)
+
+define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_maxnum_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_max_f32_e32 v0, v0, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_maxnum_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maxnum_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maxnum_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX9-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX10-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX10-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b)
+ ret bfloat %op
+}
+
+define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v2bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_max_f32_e32 v0, v0, v2
+; GCN-NEXT: v_max_f32_e32 v1, v1, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_maxnum_v2bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maxnum_v2bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f16_e32 v2, v0, v0
+; GFX8-NEXT: v_max_f16_e32 v3, v1, v1
+; GFX8-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v3
+; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maxnum_v2bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_v2bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
+ ret <2 x bfloat> %op
+}
+
+define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v3bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: v_max_f32_e32 v0, v0, v3
+; GCN-NEXT: v_max_f32_e32 v1, v1, v4
+; GCN-NEXT: v_max_f32_e32 v2, v2, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_maxnum_v3bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maxnum_v3bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f16_e32 v1, v0, v0
+; GFX8-NEXT: v_max_f16_e32 v3, v2, v2
+; GFX8-NEXT: v_max_f16_e32 v3, v1, v3
+; GFX8-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v1, v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maxnum_v3bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v0
+; GFX9-NEXT: v_bfi_b32 v1, s4, v2, v2
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_v3bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v0, v0
+; GFX10-NEXT: v_bfi_b32 v1, 0xffff, v2, v2
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
+ ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: v_max_f32_e32 v0, v0, v4
+; GCN-NEXT: v_max_f32_e32 v1, v1, v5
+; GCN-NEXT: v_max_f32_e32 v2, v2, v6
+; GCN-NEXT: v_max_f32_e32 v3, v3, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_maxnum_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maxnum_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f16_e32 v1, v0, v0
+; GFX8-NEXT: v_max_f16_e32 v3, v2, v2
+; GFX8-NEXT: v_max_f16_e32 v3, v1, v3
+; GFX8-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v1, v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maxnum_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT: v_pk_max_f16 v1, v2, v2
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT: v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT: v_pk_max_f16 v1, v2, v2
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
+ ret <4 x bfloat> %op
+}
+
+define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v8bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT: v_max_f32_e32 v0, v0, v8
+; GCN-NEXT: v_max_f32_e32 v1, v1, v9
+; GCN-NEXT: v_max_f32_e32 v2, v2, v10
+; GCN-NEXT: v_max_f32_e32 v3, v3, v11
+; GCN-NEXT: v_max_f32_e32 v4, v4, v12
+; GCN-NEXT: v_max_f32_e32 v5, v5, v13
+; GCN-NEXT: v_max_f32_e32 v6, v6, v14
+; GCN-NEXT: v_max_f32_e32 v7, v7, v15
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_maxnum_v8bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v10
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v11
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v12
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v13
+; GFX7-NEXT: v_max_f32_e32 v4, v4, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v14
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v15
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v8
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_max_f32_e32 v7, v7, v9
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maxnum_v8bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f16_e32 v2, v0, v0
+; GFX8-NEXT: v_max_f16_e32 v3, v4, v4
+; GFX8-NEXT: v_max_f16_e32 v6, v2, v3
+; GFX8-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v4, v0, v2
+; GFX8-NEXT: v_max_f16_e32 v0, v1, v1
+; GFX8-NEXT: v_max_f16_e32 v2, v5, v5
+; GFX8-NEXT: v_max_f16_e32 v2, v0, v2
+; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v3, v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, v6
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maxnum_v8bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX9-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT: v_pk_max_f16 v2, v4, v4
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT: v_pk_max_f16 v2, v5, v5
+; GFX9-NEXT: v_pk_max_f16 v2, v1, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_v8bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX10-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT: v_pk_max_f16 v2, v4, v4
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT: v_pk_max_f16 v3, v5, v5
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX10-NEXT: v_pk_max_f16 v2, v1, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
+ ret <8 x bfloat> %op
+}
+
+define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v16bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT: v_max_f32_e32 v0, v0, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v17
+; GCN-NEXT: v_max_f32_e32 v1, v1, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v18
+; GCN-NEXT: v_max_f32_e32 v2, v2, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v19
+; GCN-NEXT: v_max_f32_e32 v3, v3, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v20
+; GCN-NEXT: v_max_f32_e32 v4, v4, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v21
+; GCN-NEXT: v_max_f32_e32 v5, v5, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v22
+; GCN-NEXT: v_max_f32_e32 v6, v6, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v23
+; GCN-NEXT: v_max_f32_e32 v7, v7, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v24
+; GCN-NEXT: v_max_f32_e32 v8, v8, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v25
+; GCN-NEXT: v_max_f32_e32 v9, v9, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v26
+; GCN-NEXT: v_max_f32_e32 v10, v10, v16
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT: v_cvt_f32_f16_e32 v17, v27
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT: v_cvt_f32_f16_e32 v18, v28
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v29
+; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v20, v30
+; GCN-NEXT: v_max_f32_e32 v11, v11, v17
+; GCN-NEXT: v_max_f32_e32 v12, v12, v18
+; GCN-NEXT: v_max_f32_e32 v13, v13, v19
+; GCN-NEXT: v_max_f32_e32 v14, v14, v20
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT: v_max_f32_e32 v15, v15, v16
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_maxnum_v16bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v20
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v21
+; GFX7-NEXT: v_max_f32_e32 v4, v4, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v19
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v22
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v16
+; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v23
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v24
+; GFX7-NEXT: v_max_f32_e32 v7, v7, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v25
+; GFX7-NEXT: v_max_f32_e32 v8, v8, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v26
+; GFX7-NEXT: v_max_f32_e32 v9, v9, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v27
+; GFX7-NEXT: v_max_f32_e32 v10, v10, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v28
+; GFX7-NEXT: v_max_f32_e32 v11, v11, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v29
+; GFX7-NEXT: v_max_f32_e32 v12, v12, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v30
+; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GFX7-NEXT: v_max_f32_e32 v13, v13, v18
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_max_f32_e32 v14, v14, v17
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT: v_max_f32_e32 v15, v15, v16
+; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maxnum_v16bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f16_e32 v4, v0, v0
+; GFX8-NEXT: v_max_f16_e32 v5, v8, v8
+; GFX8-NEXT: v_max_f16_e32 v12, v4, v5
+; GFX8-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v4, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v8, v0, v4
+; GFX8-NEXT: v_max_f16_e32 v0, v1, v1
+; GFX8-NEXT: v_max_f16_e32 v4, v9, v9
+; GFX8-NEXT: v_max_f16_e32 v13, v0, v4
+; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v9, v0, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v2, v2
+; GFX8-NEXT: v_max_f16_e32 v1, v10, v10
+; GFX8-NEXT: v_max_f16_e32 v4, v0, v1
+; GFX8-NEXT: v_max_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v5, v0, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v3, v3
+; GFX8-NEXT: v_max_f16_e32 v1, v11, v11
+; GFX8-NEXT: v_max_f16_e32 v6, v0, v1
+; GFX8-NEXT: v_max_f16_sdwa v0, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v7, v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, v12
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
+; GFX8-NEXT: v_mov_b32_e32 v2, v13
+; GFX8-NEXT: v_mov_b32_e32 v3, v9
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maxnum_v16bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v9
+; GFX9-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v10
+; GFX9-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT: v_pk_max_f16 v4, v8, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v11
+; GFX9-NEXT: v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v4
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT: v_pk_max_f16 v4, v9, v9
+; GFX9-NEXT: v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_max_f16 v8, v1, v4
+; GFX9-NEXT: v_pk_max_f16 v1, v2, v2
+; GFX9-NEXT: v_pk_max_f16 v2, v10, v10
+; GFX9-NEXT: v_pk_max_f16 v4, v1, v2
+; GFX9-NEXT: v_pk_max_f16 v1, v3, v3
+; GFX9-NEXT: v_pk_max_f16 v2, v11, v11
+; GFX9-NEXT: v_pk_max_f16 v6, v1, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX9-NEXT: v_mov_b32_e32 v2, v8
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_v16bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v11
+; GFX10-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT: v_pk_max_f16 v4, v8, v8
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT: v_pk_max_f16 v5, v9, v9
+; GFX10-NEXT: v_pk_max_f16 v6, v2, v2
+; GFX10-NEXT: v_pk_max_f16 v7, v10, v10
+; GFX10-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX10-NEXT: v_pk_max_f16 v8, v11, v11
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v4
+; GFX10-NEXT: v_pk_max_f16 v2, v1, v5
+; GFX10-NEXT: v_pk_max_f16 v4, v6, v7
+; GFX10-NEXT: v_pk_max_f16 v6, v3, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
+ ret <16 x bfloat> %op
+}
+
+define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v32bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: v_max_f32_e32 v0, v0, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GCN-NEXT: v_max_f32_e32 v1, v1, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_max_f32_e32 v2, v2, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GCN-NEXT: v_max_f32_e32 v3, v3, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_max_f32_e32 v4, v4, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GCN-NEXT: v_max_f32_e32 v5, v5, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_max_f32_e32 v6, v6, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GCN-NEXT: v_max_f32_e32 v7, v7, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_max_f32_e32 v8, v8, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GCN-NEXT: v_max_f32_e32 v9, v9, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_max_f32_e32 v10, v10, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GCN-NEXT: v_max_f32_e32 v11, v11, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_max_f32_e32 v12, v12, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GCN-NEXT: v_max_f32_e32 v13, v13, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_max_f32_e32 v14, v14, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GCN-NEXT: v_max_f32_e32 v15, v15, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_max_f32_e32 v16, v16, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GCN-NEXT: v_max_f32_e32 v17, v17, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_max_f32_e32 v18, v18, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GCN-NEXT: v_max_f32_e32 v19, v19, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_max_f32_e32 v20, v20, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GCN-NEXT: v_max_f32_e32 v21, v21, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_max_f32_e32 v22, v22, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GCN-NEXT: v_max_f32_e32 v23, v23, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_max_f32_e32 v24, v24, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GCN-NEXT: v_max_f32_e32 v25, v25, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_max_f32_e32 v26, v26, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GCN-NEXT: v_max_f32_e32 v27, v27, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_max_f32_e32 v28, v28, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: v_max_f32_e32 v29, v29, v31
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124
+; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: v_max_f32_e32 v30, v30, v31
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v33
+; GCN-NEXT: v_max_f32_e32 v31, v31, v32
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16
+; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17
+; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18
+; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19
+; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20
+; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21
+; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22
+; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23
+; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24
+; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25
+; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26
+; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27
+; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28
+; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29
+; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30
+; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_maxnum_v32bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19
+; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v20
+; GFX7-NEXT: v_cvt_f32_f16_e32 v21, v21
+; GFX7-NEXT: v_cvt_f32_f16_e32 v22, v22
+; GFX7-NEXT: v_cvt_f32_f16_e32 v23, v23
+; GFX7-NEXT: v_cvt_f32_f16_e32 v24, v24
+; GFX7-NEXT: v_cvt_f32_f16_e32 v25, v25
+; GFX7-NEXT: v_cvt_f32_f16_e32 v26, v26
+; GFX7-NEXT: v_cvt_f32_f16_e32 v27, v27
+; GFX7-NEXT: v_cvt_f32_f16_e32 v28, v28
+; GFX7-NEXT: v_cvt_f32_f16_e32 v29, v29
+; GFX7-NEXT: v_cvt_f32_f16_e32 v30, v30
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v4, v4, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v7, v7, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v8, v8, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v9, v9, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
+; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v10, v10, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
+; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v11, v11, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52
+; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v12, v12, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v13, v13, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
+; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v14, v14, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
+; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v15, v15, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68
+; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v16, v16, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v17, v17, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
+; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v17
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v18, v18, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
+; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v18
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v19, v19, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84
+; GFX7-NEXT: v_cvt_f16_f32_e32 v19, v19
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v20, v20, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
+; GFX7-NEXT: v_cvt_f16_f32_e32 v20, v20
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v21, v21, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92
+; GFX7-NEXT: v_cvt_f16_f32_e32 v21, v21
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v22, v22, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
+; GFX7-NEXT: v_cvt_f16_f32_e32 v22, v22
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v23, v23, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100
+; GFX7-NEXT: v_cvt_f16_f32_e32 v23, v23
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v24, v24, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
+; GFX7-NEXT: v_cvt_f16_f32_e32 v24, v24
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v25, v25, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108
+; GFX7-NEXT: v_cvt_f16_f32_e32 v25, v25
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v26, v26, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
+; GFX7-NEXT: v_cvt_f16_f32_e32 v26, v26
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v27, v27, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
+; GFX7-NEXT: v_cvt_f16_f32_e32 v27, v27
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v28, v28, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
+; GFX7-NEXT: v_cvt_f16_f32_e32 v28, v28
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v29, v29, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124
+; GFX7-NEXT: v_cvt_f16_f32_e32 v29, v29
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v30, v30, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX7-NEXT: v_cvt_f16_f32_e32 v30, v30
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v31, v31, v32
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maxnum_v32bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f16_e32 v8, v0, v0
+; GFX8-NEXT: v_max_f16_e32 v9, v16, v16
+; GFX8-NEXT: v_max_f16_e32 v24, v8, v9
+; GFX8-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v8, v16, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v16, v0, v8
+; GFX8-NEXT: v_max_f16_e32 v0, v1, v1
+; GFX8-NEXT: v_max_f16_e32 v8, v17, v17
+; GFX8-NEXT: v_max_f16_e32 v25, v0, v8
+; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v17, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v17, v0, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v2, v2
+; GFX8-NEXT: v_max_f16_e32 v1, v18, v18
+; GFX8-NEXT: v_max_f16_e32 v26, v0, v1
+; GFX8-NEXT: v_max_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v18, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v18, v0, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v3, v3
+; GFX8-NEXT: v_max_f16_e32 v1, v19, v19
+; GFX8-NEXT: v_max_f16_e32 v27, v0, v1
+; GFX8-NEXT: v_max_f16_sdwa v0, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v19, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v19, v0, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v4, v4
+; GFX8-NEXT: v_max_f16_e32 v1, v20, v20
+; GFX8-NEXT: v_max_f16_e32 v8, v0, v1
+; GFX8-NEXT: v_max_f16_sdwa v0, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v20, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v9, v0, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v5, v5
+; GFX8-NEXT: v_max_f16_e32 v1, v21, v21
+; GFX8-NEXT: v_max_f16_e32 v10, v0, v1
+; GFX8-NEXT: v_max_f16_sdwa v0, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v21, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v11, v0, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v6, v6
+; GFX8-NEXT: v_max_f16_e32 v1, v22, v22
+; GFX8-NEXT: v_max_f16_e32 v12, v0, v1
+; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v22, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v13, v0, v1
+; GFX8-NEXT: v_max_f16_e32 v0, v7, v7
+; GFX8-NEXT: v_max_f16_e32 v1, v23, v23
+; GFX8-NEXT: v_max_f16_e32 v14, v0, v1
+; GFX8-NEXT: v_max_f16_sdwa v0, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v1, v23, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v15, v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, v24
+; GFX8-NEXT: v_mov_b32_e32 v1, v16
+; GFX8-NEXT: v_mov_b32_e32 v2, v25
+; GFX8-NEXT: v_mov_b32_e32 v3, v17
+; GFX8-NEXT: v_mov_b32_e32 v4, v26
+; GFX8-NEXT: v_mov_b32_e32 v5, v18
+; GFX8-NEXT: v_mov_b32_e32 v6, v27
+; GFX8-NEXT: v_mov_b32_e32 v7, v19
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maxnum_v32bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v17
+; GFX9-NEXT: v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v20
+; GFX9-NEXT: v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18
+; GFX9-NEXT: v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v20, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT: v_pk_max_f16 v8, v16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v19
+; GFX9-NEXT: v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v8
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT: v_pk_max_f16 v8, v17, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX9-NEXT: v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_max_f16 v16, v1, v8
+; GFX9-NEXT: v_pk_max_f16 v1, v2, v2
+; GFX9-NEXT: v_pk_max_f16 v2, v18, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v21
+; GFX9-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_max_f16 v18, v1, v2
+; GFX9-NEXT: v_pk_max_f16 v1, v3, v3
+; GFX9-NEXT: v_pk_max_f16 v2, v19, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v22
+; GFX9-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v21, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_max_f16 v17, v1, v2
+; GFX9-NEXT: v_pk_max_f16 v1, v4, v4
+; GFX9-NEXT: v_pk_max_f16 v2, v20, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v23
+; GFX9-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v22, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_max_f16 v8, v1, v2
+; GFX9-NEXT: v_pk_max_f16 v1, v5, v5
+; GFX9-NEXT: v_pk_max_f16 v2, v21, v21
+; GFX9-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v23, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_max_f16 v10, v1, v2
+; GFX9-NEXT: v_pk_max_f16 v1, v6, v6
+; GFX9-NEXT: v_pk_max_f16 v2, v22, v22
+; GFX9-NEXT: v_pk_max_f16 v12, v1, v2
+; GFX9-NEXT: v_pk_max_f16 v1, v7, v7
+; GFX9-NEXT: v_pk_max_f16 v2, v23, v23
+; GFX9-NEXT: v_pk_max_f16 v14, v1, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX9-NEXT: v_mov_b32_e32 v2, v16
+; GFX9-NEXT: v_mov_b32_e32 v4, v18
+; GFX9-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_v32bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v21
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v22
+; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v23
+; GFX10-NEXT: v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v20, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v21, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT: v_pk_max_f16 v8, v16, v16
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT: v_pk_max_f16 v9, v17, v17
+; GFX10-NEXT: v_pk_max_f16 v10, v2, v2
+; GFX10-NEXT: v_pk_max_f16 v11, v18, v18
+; GFX10-NEXT: v_mov_b32_sdwa v22, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v23, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v8
+; GFX10-NEXT: v_pk_max_f16 v2, v1, v9
+; GFX10-NEXT: v_pk_max_f16 v16, v10, v11
+; GFX10-NEXT: v_pk_max_f16 v1, v3, v3
+; GFX10-NEXT: v_pk_max_f16 v3, v19, v19
+; GFX10-NEXT: v_pk_max_f16 v4, v4, v4
+; GFX10-NEXT: v_pk_max_f16 v8, v20, v20
+; GFX10-NEXT: v_pk_max_f16 v5, v5, v5
+; GFX10-NEXT: v_pk_max_f16 v9, v21, v21
+; GFX10-NEXT: v_pk_max_f16 v11, v6, v6
+; GFX10-NEXT: v_pk_max_f16 v12, v22, v22
+; GFX10-NEXT: v_pk_max_f16 v7, v7, v7
+; GFX10-NEXT: v_pk_max_f16 v13, v23, v23
+; GFX10-NEXT: v_pk_max_f16 v6, v1, v3
+; GFX10-NEXT: v_pk_max_f16 v8, v4, v8
+; GFX10-NEXT: v_pk_max_f16 v10, v5, v9
+; GFX10-NEXT: v_pk_max_f16 v12, v11, v12
+; GFX10-NEXT: v_pk_max_f16 v14, v7, v13
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX10-NEXT: v_mov_b32_e32 v4, v16
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
+ ret <32 x bfloat> %op
+}
+
+declare bfloat @llvm.sqrt.bf16(bfloat)
+
+define bfloat @v_sqrt_bf16(bfloat %a) {
+; GCN-LABEL: v_sqrt_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_sqrt_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_sqrt_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_sqrt_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sqrt_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call bfloat @llvm.sqrt.bf16(bfloat %a)
+ ret bfloat %op
+}
+
+declare bfloat @llvm.ldexp.bf16.i32(bfloat, i32)
+
+define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
+; GCN-LABEL: v_ldexp_bf16_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_ldexp_bf16_i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_ldexp_bf16_i32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, 0xffff8000
+; GFX8-NEXT: v_mov_b32_e32 v3, 0x7fff
+; GFX8-NEXT: v_med3_i32 v1, v1, v2, v3
+; GFX8-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_ldexp_bf16_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff8000
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fff
+; GFX9-NEXT: v_med3_i32 v1, v1, v2, v3
+; GFX9-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_ldexp_bf16_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v2, 0x7fff
+; GFX10-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX10-NEXT: v_ldexp_f16_e32 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call bfloat @llvm.ldexp.bf16.i32(bfloat %a, i32 %b)
+ ret bfloat %op
+}
+
+declare { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat)
+
+define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
+; GCN-LABEL: v_frexp_bf16_i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0x7f800000
+; GCN-NEXT: v_frexp_mant_f32_e32 v2, v0
+; GCN-NEXT: v_frexp_exp_i32_f32_e32 v3, v0
+; GCN-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1
+; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_frexp_bf16_i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_frexp_bf16_i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_frexp_mant_f16_e32 v2, v0
+; GFX8-NEXT: v_frexp_exp_i16_f16_e32 v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_frexp_bf16_i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_frexp_mant_f16_e32 v2, v0
+; GFX9-NEXT: v_frexp_exp_i16_f16_e32 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_frexp_bf16_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_frexp_mant_f16_e32 v2, v0
+; GFX10-NEXT: v_frexp_exp_i16_f16_e32 v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat %a)
+ ret { bfloat, i16 } %op
+}
+
+
+declare bfloat @llvm.log.bf16(bfloat)
+declare bfloat @llvm.log2.bf16(bfloat)
+declare bfloat @llvm.log10.bf16(bfloat)
+
+define bfloat @v_log_bf16(bfloat %a) {
+; GCN-LABEL: v_log_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_log_f32_e32 v0, v0
+; GCN-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_log_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_log_f32_e32 v0, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_log_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_e32 v0, v0
+; GFX8-NEXT: v_mul_f16_e32 v0, 0x398c, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_log_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_log_f16_e32 v0, v0
+; GFX9-NEXT: v_mul_f16_e32 v0, 0x398c, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_log_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_e32 v0, v0
+; GFX10-NEXT: v_mul_f16_e32 v0, 0x398c, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call bfloat @llvm.log.bf16(bfloat %a)
+ ret bfloat %op
+}
+
+define bfloat @v_log2_bf16(bfloat %a) { ; Exercises llvm.log2.bf16 (base-2 log) on a scalar bfloat argument and returns the result.
+; GCN-LABEL: v_log2_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_log_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_log2_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_log_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_log2_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_log2_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_log_f16_e32 v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_log2_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_e32 v0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call bfloat @llvm.log2.bf16(bfloat %a) ; NOTE(review): checks expect f16-named instructions (v_cvt_f32_f16 / v_log_f16) on a bfloat value - confirm intended.
+ ret bfloat %op
+}
+
+define bfloat @v_log10_bf16(bfloat %a) { ; Exercises llvm.log10.bf16 (base-10 log) on a scalar bfloat argument and returns the result.
+; GCN-LABEL: v_log10_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_log_f32_e32 v0, v0
+; GCN-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_log10_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_log_f32_e32 v0, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_log10_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_log_f16_e32 v0, v0
+; GFX8-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_log10_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_log_f16_e32 v0, v0
+; GFX9-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_log10_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_log_f16_e32 v0, v0
+; GFX10-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call bfloat @llvm.log10.bf16(bfloat %a) ; NOTE(review): checks expect v_log_* then a multiply by 0x3e9a209b / 0x34d1 (presumably log10(2)), all via f16-named instructions on a bfloat value - confirm intended.
+ ret bfloat %op
+}
+
+declare bfloat @llvm.exp.bf16(bfloat)
+declare bfloat @llvm.exp2.bf16(bfloat)
+declare bfloat @llvm.exp10.bf16(bfloat)
+
+define bfloat @v_exp_bf16(bfloat %a) { ; Exercises llvm.exp.bf16 (base-e exponential) on a scalar bfloat argument and returns the result.
+; GCN-LABEL: v_exp_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GCN-NEXT: v_exp_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_exp_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX7-NEXT: v_exp_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_exp_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT: v_exp_f32_e32 v0, v0
+; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_exp_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX9-NEXT: v_exp_f32_e32 v0, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_exp_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT: v_exp_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call bfloat @llvm.exp.bf16(bfloat %a) ; NOTE(review): every check prefix expects the same f32 sequence: scale by 0x3fb8aa3b (presumably log2(e)) then v_exp_f32, wrapped in f16<->f32 conversions of a bfloat value - confirm intended.
+ ret bfloat %op
+}
+
+define bfloat @v_exp2_bf16(bfloat %a) { ; Exercises llvm.exp2.bf16 (base-2 exponential) on a scalar bfloat argument and returns the result.
+; GCN-LABEL: v_exp2_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_exp_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_exp2_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_exp_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_exp2_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_exp_f16_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_exp2_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_exp_f16_e32 v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_exp2_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_exp_f16_e32 v0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call bfloat @llvm.exp2.bf16(bfloat %a) ; NOTE(review): checks expect f16-named instructions (v_cvt_f32_f16 / v_exp_f16) on a bfloat value - confirm intended.
+ ret bfloat %op
+}
+
+define bfloat @v_exp10_bf16(bfloat %a) { ; Exercises llvm.exp10.bf16 (base-10 exponential) on a scalar bfloat argument and returns the result.
+; GCN-LABEL: v_exp10_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GCN-NEXT: v_exp_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_exp10_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX7-NEXT: v_exp_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_exp10_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX8-NEXT: v_exp_f32_e32 v0, v0
+; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_exp10_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX9-NEXT: v_exp_f32_e32 v0, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_exp10_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX10-NEXT: v_exp_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call bfloat @llvm.exp10.bf16(bfloat %a) ; NOTE(review): the expected asm is identical to v_exp_bf16 (same 0x3fb8aa3b scale); if that constant is log2(e) this computes e^x rather than 10^x - confirm. f16-named conversions are also applied to a bfloat value.
+ ret bfloat %op
+}
+
+declare bfloat @llvm.ceil.bf16(bfloat)
+
+define bfloat @v_ceil_bf16(bfloat %a) { ; Exercises llvm.ceil.bf16 on a scalar bfloat argument and returns the result.
+; GCN-LABEL: v_ceil_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_ceil_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_ceil_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_ceil_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_ceil_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_ceil_f16_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_ceil_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_ceil_f16_e32 v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_ceil_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_ceil_f16_e32 v0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call bfloat @llvm.ceil.bf16(bfloat %a) ; NOTE(review): checks expect f16-named instructions (v_cvt_f32_f16 / v_ceil_f16) on a bfloat value - confirm intended.
+ ret bfloat %op
+}
+
+declare bfloat @llvm.trunc.bf16(bfloat)
+
+define bfloat @v_trunc_bf16(bfloat %a) { ; Exercises llvm.trunc.bf16 on a scalar bfloat argument and returns the result.
+; GCN-LABEL: v_trunc_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_trunc_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_trunc_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_trunc_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_trunc_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_trunc_f16_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_trunc_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_trunc_f16_e32 v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_trunc_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_trunc_f16_e32 v0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call bfloat @llvm.trunc.bf16(bfloat %a) ; NOTE(review): checks expect f16-named instructions (v_cvt_f32_f16 / v_trunc_f16) on a bfloat value - confirm intended.
+ ret bfloat %op
+}
+
+declare bfloat @llvm.rint.bf16(bfloat)
+
+define bfloat @v_rint_bf16(bfloat %a) { ; Exercises llvm.rint.bf16 on a scalar bfloat argument and returns the result.
+; GCN-LABEL: v_rint_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_rndne_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_rint_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_rndne_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_rint_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_rndne_f16_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_rint_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_rndne_f16_e32 v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_rint_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_rndne_f16_e32 v0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call bfloat @llvm.rint.bf16(bfloat %a) ; NOTE(review): checks expect f16-named instructions (v_cvt_f32_f16 / v_rndne_f16) on a bfloat value - confirm intended.
+ ret bfloat %op
+}
+
+declare bfloat @llvm.nearbyint.bf16(bfloat)
+
+; FIXME: Test disabled; it currently fails with "unable to legalize instruction: %2:_(s16) = G_FNEARBYINT %0:_".
+; define bfloat @v_nearbyint_bf16(bfloat %a) {
+; %op = call bfloat @llvm.nearbyint.bf16(bfloat %a)
+; ret bfloat %op
+; }
+
+declare bfloat @llvm.round.bf16(bfloat)
+
+define bfloat @v_round_bf16(bfloat %a) { ; Exercises llvm.round.bf16 on a scalar bfloat argument and returns the result.
+; GCN-LABEL: v_round_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, 0.5
+; GCN-NEXT: v_mov_b32_e32 v3, 0x3c00
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff8000, v0
+; GCN-NEXT: v_trunc_f32_e32 v4, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: v_cvt_f32_f16_e64 v5, -v4
+; GCN-NEXT: v_add_f32_e32 v1, v1, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e64 v1, |v1|
+; GCN-NEXT: v_cmp_ge_f32_e32 vcc, v1, v2
+; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_add_f32_e32 v0, v1, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_round_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v4, 0x3c00
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff8000, v0
+; GFX7-NEXT: v_trunc_f32_e32 v2, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e64 v3, -v2
+; GFX7-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, 0.5
+; GFX7-NEXT: v_cvt_f32_f16_e64 v1, |v1|
+; GFX7-NEXT: v_cmp_ge_f32_e32 vcc, v1, v3
+; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc
+; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_add_f32_e32 v0, v1, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_round_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_trunc_f16_e32 v1, v0
+; GFX8-NEXT: v_sub_f16_e32 v2, v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v3, 0x3c00
+; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
+; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT: v_add_f16_e32 v0, v1, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_round_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_trunc_f16_e32 v1, v0
+; GFX9-NEXT: v_sub_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x3c00
+; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
+; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff8000, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX9-NEXT: v_add_f16_e32 v0, v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_round_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_trunc_f16_e32 v1, v0
+; GFX10-NEXT: v_sub_f16_e32 v2, v0, v1
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff8000, v0
+; GFX10-NEXT: v_cmp_ge_f16_e64 s4, |v2|, 0.5
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s4
+; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX10-NEXT: v_add_f16_e32 v0, v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call bfloat @llvm.round.bf16(bfloat %a) ; NOTE(review): checks expect trunc, a |x - trunc(x)| >= 0.5 select of 0x3c00 (presumably f16 1.0) OR'd with the masked input (0xffff8000), then an add; all via f16-named instructions on a bfloat value - confirm intended.
+ ret bfloat %op
+}
+
+declare bfloat @llvm.roundeven.bf16(bfloat)
+
+define bfloat @v_roundeven_bf16(bfloat %a) { ; Exercises llvm.roundeven.bf16 on a scalar bfloat argument and returns the result.
+; GCN-LABEL: v_roundeven_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_rndne_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_rndne_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_rndne_f16_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_rndne_f16_e32 v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_roundeven_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_rndne_f16_e32 v0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call bfloat @llvm.roundeven.bf16(bfloat %a) ; NOTE(review): checks expect f16-named instructions (v_cvt_f32_f16 / v_rndne_f16) on a bfloat value - confirm intended.
+ ret bfloat %op
+}
+
+declare bfloat @llvm.floor.bf16(bfloat)
+
+define bfloat @v_floor_bf16(bfloat %a) { ; Exercises llvm.floor.bf16 on a scalar bfloat argument and returns the result.
+; GCN-LABEL: v_floor_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_floor_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_floor_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_floor_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_floor_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_floor_f16_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_floor_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_floor_f16_e32 v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_floor_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_floor_f16_e32 v0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call bfloat @llvm.floor.bf16(bfloat %a) ; NOTE(review): checks expect f16-named instructions (v_cvt_f32_f16 / v_floor_f16) on a bfloat value - confirm intended.
+ ret bfloat %op
+}
+
+declare bfloat @llvm.canonicalize.bf16(bfloat)
+
+define bfloat @v_canonicalize_bf16(bfloat %a) { ; Exercises llvm.canonicalize.bf16 on a scalar bfloat argument and returns the result.
+; GCN-LABEL: v_canonicalize_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_canonicalize_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_canonicalize_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_canonicalize_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_canonicalize_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call bfloat @llvm.canonicalize.bf16(bfloat %a) ; NOTE(review): the GCN and GFX7 checks have no instruction between s_waitcnt and the return, while GFX8+ expect v_max_f16 v0, v0, v0 on a bfloat value - confirm both are intended.
+ ret bfloat %op
+}
+
+declare bfloat @llvm.arithmetic.fence.bf16(bfloat)
+
+; FIXME: Promotion is broken for llvm.arithmetic.fence.bf16, so this test is disabled.
+; define bfloat @v_arithmetic_fence_bf16(bfloat %a) {
+; %op = call bfloat @llvm.arithmetic.fence.bf16(bfloat %a)
+; ret bfloat %op
+; }
+
+define i1 @v_fcmp_false_bf16(bfloat %a, bfloat %b) { ; Exercises fcmp with the always-false predicate on two scalar bfloat operands; returns i1.
+; GCN-LABEL: v_fcmp_false_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_false_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, 0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_false_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, 0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_false_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_false_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fcmp false bfloat %a, %b ; Every check prefix expects a constant 0 moved into v0; no compare instruction appears.
+ ret i1 %op
+}
+
+define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) { ; Exercises fcmp oeq (ordered and equal) on two scalar bfloat operands; returns i1.
+; GCN-LABEL: v_fcmp_oeq_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_oeq_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
+; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_oeq_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_oeq_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_oeq_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_eq_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fcmp oeq bfloat %a, %b ; NOTE(review): checks expect v_cmp_eq_f16, or v_cmp_eq_f32 after v_cvt_f32_f16, i.e. f16-named instructions on bfloat inputs - confirm intended.
+ ret i1 %op
+}
+
+define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) { ; Exercises fcmp ogt (ordered and greater than) on two scalar bfloat operands; returns i1.
+; GCN-LABEL: v_fcmp_ogt_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_ogt_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
+; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_ogt_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_ogt_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_ogt_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_gt_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fcmp ogt bfloat %a, %b ; NOTE(review): checks expect v_cmp_gt_f16, or v_cmp_gt_f32 after v_cvt_f32_f16, i.e. f16-named instructions on bfloat inputs - confirm intended.
+ ret i1 %op
+}
+
+define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) { ; Exercises fcmp oge (ordered and greater than or equal) on two scalar bfloat operands; returns i1.
+; GCN-LABEL: v_fcmp_oge_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_oge_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
+; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_oge_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_ge_f16_e32 vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_oge_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_oge_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fcmp oge bfloat %a, %b ; NOTE(review): checks expect v_cmp_ge_f16, or v_cmp_ge_f32 after v_cvt_f32_f16, i.e. f16-named instructions on bfloat inputs - confirm intended.
+ ret i1 %op
+}
+
+define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) { ; Exercises fcmp olt (ordered and less than) on two scalar bfloat operands; returns i1.
+; GCN-LABEL: v_fcmp_olt_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_olt_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_olt_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_olt_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_olt_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fcmp olt bfloat %a, %b ; NOTE(review): checks expect v_cmp_lt_f16, or v_cmp_lt_f32 after v_cvt_f32_f16, i.e. f16-named instructions on bfloat inputs - confirm intended.
+ ret i1 %op
+}
+
+define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) { ; Exercises fcmp ole (ordered and less than or equal) on two scalar bfloat operands; returns i1.
+; GCN-LABEL: v_fcmp_ole_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_ole_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
+; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_ole_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_le_f16_e32 vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_ole_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_le_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_ole_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_le_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fcmp ole bfloat %a, %b ; NOTE(review): checks expect v_cmp_le_f16, or v_cmp_le_f32 after v_cvt_f32_f16, i.e. f16-named instructions on bfloat inputs - confirm intended.
+ ret i1 %op
+}
+
+define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) { ; Exercises fcmp one (ordered and not equal) on two scalar bfloat operands; returns i1.
+; GCN-LABEL: v_fcmp_one_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_one_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
+; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_one_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_lg_f16_e32 vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_one_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_lg_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_one_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_lg_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fcmp one bfloat %a, %b ; NOTE(review): checks expect v_cmp_lg_f16, or v_cmp_lg_f32 after v_cvt_f32_f16, i.e. f16-named instructions on bfloat inputs - confirm intended.
+ ret i1 %op
+}
+
+define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) { ; Exercises fcmp uno (unordered: either operand is a NaN) on two scalar bfloat operands; returns i1.
+; GCN-LABEL: v_fcmp_uno_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_uno_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
+; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_uno_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_u_f16_e32 vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_uno_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_u_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_uno_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fcmp uno bfloat %a, %b ; NOTE(review): checks expect v_cmp_u_f16, or v_cmp_u_f32 after v_cvt_f32_f16, i.e. f16-named instructions on bfloat inputs - confirm intended.
+ ret i1 %op
+}
+
+define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) { ; Exercises fcmp ueq (unordered or equal) on two scalar bfloat operands; returns i1.
+; GCN-LABEL: v_fcmp_ueq_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_ueq_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
+; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_ueq_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_nlg_f16_e32 vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_ueq_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_nlg_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_ueq_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fcmp ueq bfloat %a, %b ; NOTE(review): checks expect v_cmp_nlg_f16, or v_cmp_nlg_f32 after v_cvt_f32_f16, i.e. f16-named instructions on bfloat inputs - confirm intended.
+ ret i1 %op
+}
+
+define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) { ; Exercises fcmp ugt (unordered or greater than) on two scalar bfloat operands; returns i1.
+; GCN-LABEL: v_fcmp_ugt_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_ugt_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
+; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_ugt_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_ugt_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_ugt_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fcmp ugt bfloat %a, %b ; NOTE(review): checks expect v_cmp_nle_f16, or v_cmp_nle_f32 after v_cvt_f32_f16, i.e. f16-named instructions on bfloat inputs - confirm intended.
+ ret i1 %op
+}
+
+define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) { ; Exercises fcmp uge (unordered or greater than or equal) on two scalar bfloat operands; returns i1.
+; GCN-LABEL: v_fcmp_uge_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_uge_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
+; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_uge_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_uge_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_uge_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fcmp uge bfloat %a, %b ; NOTE(review): checks expect v_cmp_nlt_f16, or v_cmp_nlt_f32 after v_cvt_f32_f16, i.e. f16-named instructions on bfloat inputs - confirm intended.
+ ret i1 %op
+}
+
+define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_ult_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_ult_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
+; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_ult_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_nge_f16_e32 vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_ult_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_nge_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_ult_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_nge_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fcmp ult bfloat %a, %b
+ ret i1 %op
+}
+
+define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_ule_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_ule_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
+; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_ule_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_ule_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_ule_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fcmp ule bfloat %a, %b
+ ret i1 %op
+}
+
+define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_une_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_une_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
+; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_une_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_neq_f16_e32 vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_une_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_neq_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_une_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_neq_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fcmp une bfloat %a, %b
+ ret i1 %op
+}
+
+define i1 @v_fcmp_true_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_true_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_true_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, 1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_true_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, 1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_true_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, 1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_true_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = fcmp true bfloat %a, %b
+ ret i1 %op
+}
+
+declare bfloat @llvm.copysign.bf16(bfloat, bfloat)
+
+define bfloat @v_copysign_bf16_bf16(bfloat %mag, bfloat %sign) {
+; GCN-LABEL: v_copysign_bf16_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_bf16_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_bf16_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_bf16_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_bf16_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+ ret bfloat %op
+}
+
+; FIXME: unable to lower arguments: ptr
+; define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) {
+; %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+; ret bfloat %op
+; }
+
+; FIXME: unable to lower arguments: ptr
+; define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) {
+; %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+; ret bfloat %op
+; }
+
+; FIXME: unable to translate instruction: fptrunc
+; define bfloat @v_copysign_bf16_f32(bfloat %mag, float %sign.f32) {
+; %sign = fptrunc float %sign.f32 to bfloat
+; %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+; ret bfloat %op
+; }
+
+; FIXME: unable to translate instruction: fptrunc
+; define bfloat @v_copysign_bf16_f64(bfloat %mag, double %sign.f64) {
+; %sign = fptrunc double %sign.f64 to bfloat
+; %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+; ret bfloat %op
+; }
+
+define bfloat @v_copysign_bf16_f16(bfloat %mag, half %sign.f16) {
+; GCN-LABEL: v_copysign_bf16_f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_bf16_f16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_bf16_f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_bf16_f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_bf16_f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %sign = bitcast half %sign.f16 to bfloat
+ %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+ ret bfloat %op
+}
+
+define amdgpu_ps i32 @s_copysign_bf16_bf16(bfloat inreg %mag, bfloat inreg %sign) {
+; GCN-LABEL: s_copysign_bf16_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_and_b32 s0, s0, 0x7fff
+; GCN-NEXT: s_and_b32 s1, s1, 0xffff8000
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_and_b32 s0, 0xffff, s0
+; GCN-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: s_copysign_bf16_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX7-NEXT: s_and_b32 s1, s1, 0xffff8000
+; GFX7-NEXT: s_or_b32 s0, s0, s1
+; GFX7-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX7-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_copysign_bf16_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX8-NEXT: s_and_b32 s1, s1, 0xffff8000
+; GFX8-NEXT: s_or_b32 s0, s0, s1
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_copysign_bf16_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX9-NEXT: s_and_b32 s1, s1, 0xffff8000
+; GFX9-NEXT: s_or_b32 s0, s0, s1
+; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_copysign_bf16_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX10-NEXT: s_and_b32 s1, s1, 0xffff8000
+; GFX10-NEXT: s_or_b32 s0, s0, s1
+; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX10-NEXT: ; return to shader part epilog
+ %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+ %cast = bitcast bfloat %op to i16
+ %zext = zext i16 %cast to i32
+ %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+ ret i32 %readlane
+}
+
+; FIXME: unable to translate instruction: fptrunc
+; define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f32) {
+; %sign = fptrunc float %sign.f32 to bfloat
+; %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+; %cast = bitcast bfloat %op to i16
+; %zext = zext i16 %cast to i32
+; %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+; ret i32 %readlane
+; }
+
+; FIXME: unable to translate instruction: fptrunc
+; define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign.f64) {
+; %sign = fptrunc double %sign.f64 to bfloat
+; %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+; %cast = bitcast bfloat %op to i16
+; %zext = zext i16 %cast to i32
+; %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+; ret i32 %readlane
+; }
+
+define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f16) {
+; GCN-LABEL: s_copysign_bf16_f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_and_b32 s0, s0, 0x7fff
+; GCN-NEXT: s_and_b32 s1, s1, 0xffff8000
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_and_b32 s0, 0xffff, s0
+; GCN-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: s_copysign_bf16_f16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX7-NEXT: s_and_b32 s1, s1, 0xffff8000
+; GFX7-NEXT: s_or_b32 s0, s0, s1
+; GFX7-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX7-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_copysign_bf16_f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX8-NEXT: s_and_b32 s1, s1, 0xffff8000
+; GFX8-NEXT: s_or_b32 s0, s0, s1
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_copysign_bf16_f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX9-NEXT: s_and_b32 s1, s1, 0xffff8000
+; GFX9-NEXT: s_or_b32 s0, s0, s1
+; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_copysign_bf16_f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX10-NEXT: s_and_b32 s1, s1, 0xffff8000
+; GFX10-NEXT: s_or_b32 s0, s0, s1
+; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX10-NEXT: ; return to shader part epilog
+ %sign = bitcast half %sign.f16 to bfloat
+ %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+ %cast = bitcast bfloat %op to i16
+ %zext = zext i16 %cast to i32
+ %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+ ret i32 %readlane
+}
+
+declare float @llvm.copysign.f32(float, float)
+
+; FIXME: unable to translate instruction: fpext
+; define float @v_copysign_f32_bf16(float %mag, bfloat %sign.bf16) {
+; %sign = fpext bfloat %sign.bf16 to float
+; %op = call float @llvm.copysign.f32(float %mag, float %sign)
+; ret float %op
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define amdgpu_ps i32 @s_copysign_f32_bf16(float inreg %mag, bfloat inreg %sign.bf16) {
+; %sign = fpext bfloat %sign.bf16 to float
+; %op = call float @llvm.copysign.f32(float %mag, float %sign)
+; %cast = bitcast float %op to i32
+; %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+; ret i32 %readlane
+; }
+
+declare half @llvm.copysign.f16(half, half)
+
+define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) {
+; GCN-LABEL: v_copysign_f16_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_f16_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_f16_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_f16_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_f16_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %sign = bitcast bfloat %sign.bf16 to half
+ %op = call half @llvm.copysign.f16(half %mag, half %sign)
+ ret half %op
+}
+
+define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf16) {
+; GCN-LABEL: s_copysign_f16_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_and_b32 s0, s0, 0x7fff
+; GCN-NEXT: s_and_b32 s1, s1, 0xffff8000
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: s_and_b32 s0, 0xffff, s0
+; GCN-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: s_copysign_f16_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX7-NEXT: s_and_b32 s1, s1, 0xffff8000
+; GFX7-NEXT: s_or_b32 s0, s0, s1
+; GFX7-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX7-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_copysign_f16_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX8-NEXT: s_and_b32 s1, s1, 0xffff8000
+; GFX8-NEXT: s_or_b32 s0, s0, s1
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_copysign_f16_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX9-NEXT: s_and_b32 s1, s1, 0xffff8000
+; GFX9-NEXT: s_or_b32 s0, s0, s1
+; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_copysign_f16_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX10-NEXT: s_and_b32 s1, s1, 0xffff8000
+; GFX10-NEXT: s_or_b32 s0, s0, s1
+; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX10-NEXT: ; return to shader part epilog
+ %sign = bitcast bfloat %sign.bf16 to half
+ %op = call half @llvm.copysign.f16(half %mag, half %sign)
+ %cast = bitcast half %op to i16
+ %zext = zext i16 %cast to i32
+ %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+ ret i32 %readlane
+}
+
+declare double @llvm.copysign.f64(double, double)
+
+; FIXME: unable to translate instruction: fpext
+; define double @v_copysign_f64_bf16(double %mag, bfloat %sign.bf16) {
+; %sign = fpext bfloat %sign.bf16 to double
+; %op = call double @llvm.copysign.f64(double %mag, double %sign)
+; ret double %op
+; }
+
+; FIXME: unable to translate instruction: fpext
+; define amdgpu_ps <2 x i32> @s_copysign_f64_bf16(double inreg %mag, bfloat inreg %sign.bf16) {
+; %sign = fpext bfloat %sign.bf16 to double
+; %op = call double @llvm.copysign.f64(double %mag, double %sign)
+; %cast = bitcast double %op to <2 x i32>
+; %cast.0 = extractelement <2 x i32> %cast, i32 0
+; %cast.1 = extractelement <2 x i32> %cast, i32 1
+; %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.0)
+; %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.1)
+; %ins.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
+; %ins.1 = insertelement <2 x i32> %ins.0, i32 %readlane1, i32 1
+; ret <2 x i32> %ins.1
+; }
+
+; FIXME: unable to translate instruction: fptosi
+; define i16 @v_fptosi_bf16_to_i16(bfloat %x) {
+; %op = fptosi bfloat %x to i16
+; ret i16 %op
+; }
+
+; FIXME: unable to translate instruction: fptosi
+; define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) {
+; %op = fptosi <2 x bfloat> %x to <2 x i16>
+; ret <2 x i16> %op
+; }
+
+; FIXME: unable to translate instruction: fptosi
+; define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) {
+; %op = fptosi <3 x bfloat> %x to <3 x i16>
+; ret <3 x i16> %op
+; }
+
+; FIXME: unable to translate instruction: fptosi
+; define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
+; %op = fptosi <4 x bfloat> %x to <4 x i16>
+; ret <4 x i16> %op
+; }
+
+; FIXME: unable to translate instruction: fptosi
+; define i32 @v_fptosi_bf16_to_i32(bfloat %x) {
+; %op = fptosi bfloat %x to i32
+; ret i32 %op
+; }
+
+; FIXME: unable to translate instruction: fptosi
+; define <2 x i32> @v_fptosi_v2bf16_to_v2i32(<2 x bfloat> %x) {
+; %op = fptosi <2 x bfloat> %x to <2 x i32>
+; ret <2 x i32> %op
+; }
+
+; FIXME: unable to translate instruction: fptosi
+; define <3 x i32> @v_fptosi_v3bf16_to_v3i32(<3 x bfloat> %x) {
+; %op = fptosi <3 x bfloat> %x to <3 x i32>
+; ret <3 x i32> %op
+; }
+
+; FIXME: unable to translate instruction: fptosi
+; define <4 x i32> @v_fptosi_v4bf16_to_v4i32(<4 x bfloat> %x) {
+; %op = fptosi <4 x bfloat> %x to <4 x i32>
+; ret <4 x i32> %op
+; }
+
+; FIXME: unable to translate instruction: fptosi
+; define i64 @v_fptosi_bf16_to_i64(bfloat %x) {
+; %op = fptosi bfloat %x to i64
+; ret i64 %op
+; }
+
+; FIXME: unable to translate instruction: fptosi
+; define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) {
+; %op = fptosi <2 x bfloat> %x to <2 x i64>
+; ret <2 x i64> %op
+; }
+
+; FIXME: unable to translate instruction: fptosi
+; define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) {
+; %op = fptosi <3 x bfloat> %x to <3 x i64>
+; ret <3 x i64> %op
+; }
+
+; FIXME: unable to translate instruction: fptosi
+; define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) {
+; %op = fptosi <4 x bfloat> %x to <4 x i64>
+; ret <4 x i64> %op
+; }
+
+; FIXME: unable to translate instruction: sitofp
+; define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
+; %op = sitofp i16 %x to bfloat
+; ret bfloat %op
+; }
+
+; FIXME: unable to translate instruction: sitofp
+; define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
+; %op = sitofp <2 x i16> %x to <2 x bfloat>
+; ret <2 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: sitofp
+; define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
+; %op = sitofp <3 x i16> %x to <3 x bfloat>
+; ret <3 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: sitofp
+; define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
+; %op = sitofp <4 x i16> %x to <4 x bfloat>
+; ret <4 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: sitofp
+; define bfloat @v_sitofp_i32_to_bf16(i32 %x) {
+; %op = sitofp i32 %x to bfloat
+; ret bfloat %op
+; }
+
+; FIXME: unable to translate instruction: sitofp
+; define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
+; %op = sitofp <2 x i32> %x to <2 x bfloat>
+; ret <2 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: sitofp
+; define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
+; %op = sitofp <3 x i32> %x to <3 x bfloat>
+; ret <3 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: sitofp
+; define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
+; %op = sitofp <4 x i32> %x to <4 x bfloat>
+; ret <4 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: sitofp
+; define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
+; %op = sitofp i64 %x to bfloat
+; ret bfloat %op
+; }
+
+; FIXME: unable to translate instruction: sitofp
+; define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
+; %op = sitofp <2 x i64> %x to <2 x bfloat>
+; ret <2 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: sitofp
+; define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
+; %op = sitofp <3 x i64> %x to <3 x bfloat>
+; ret <3 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: sitofp
+; define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
+; %op = sitofp <4 x i64> %x to <4 x bfloat>
+; ret <4 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: uitofp
+; define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
+; %op = uitofp i16 %x to bfloat
+; ret bfloat %op
+; }
+
+; FIXME: unable to translate instruction: uitofp
+; define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
+; %op = uitofp <2 x i16> %x to <2 x bfloat>
+; ret <2 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: uitofp
+; define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
+; %op = uitofp <3 x i16> %x to <3 x bfloat>
+; ret <3 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: uitofp
+; define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
+; %op = uitofp <4 x i16> %x to <4 x bfloat>
+; ret <4 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: uitofp
+; define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
+; %op = uitofp i32 %x to bfloat
+; ret bfloat %op
+; }
+
+; FIXME: unable to translate instruction: uitofp
+; define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
+; %op = uitofp <2 x i32> %x to <2 x bfloat>
+; ret <2 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: uitofp
+; define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
+; %op = uitofp <3 x i32> %x to <3 x bfloat>
+; ret <3 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: uitofp
+; define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
+; %op = uitofp <4 x i32> %x to <4 x bfloat>
+; ret <4 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: uitofp
+; define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
+; %op = uitofp i64 %x to bfloat
+; ret bfloat %op
+; }
+
+; FIXME: unable to translate instruction: uitofp
+; define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
+; %op = uitofp <2 x i64> %x to <2 x bfloat>
+; ret <2 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: uitofp
+; define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
+; %op = uitofp <3 x i64> %x to <3 x bfloat>
+; ret <3 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: uitofp
+; define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
+; %op = uitofp <4 x i64> %x to <4 x bfloat>
+; ret <4 x bfloat> %op
+; }
+
+define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
+; GCN-LABEL: v_select_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 1, v0
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_select_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_select_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_select_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_select_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = select i1 %cond, bfloat %a, bfloat %b
+ ret bfloat %op
+}
+
+define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
+; GCN-LABEL: v_select_fneg_lhs_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 1, v0
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_select_fneg_lhs_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_select_fneg_lhs_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_select_fneg_lhs_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_select_fneg_lhs_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = fneg bfloat %a
+ %op = select i1 %cond, bfloat %neg.a, bfloat %b
+ ret bfloat %op
+}
+
+define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
+; GCN-LABEL: v_select_fneg_rhs_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_xor_b32_e32 v2, 0x8000, v2
+; GCN-NEXT: v_and_b32_e32 v0, 1, v0
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_select_fneg_rhs_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT: v_xor_b32_e32 v2, 0x8000, v2
+; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_select_fneg_rhs_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_xor_b32_e32 v2, 0x8000, v2
+; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_select_fneg_rhs_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_xor_b32_e32 v2, 0x8000, v2
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_select_fneg_rhs_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_xor_b32_e32 v2, 0x8000, v2
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %neg.b = fneg bfloat %b
+ %op = select i1 %cond, bfloat %a, bfloat %neg.b
+ ret bfloat %op
+}
+
+define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_select_v2bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GCN-NEXT: v_and_b32_e32 v0, 1, v0
+; GCN-NEXT: v_or_b32_e32 v1, v2, v1
+; GCN-NEXT: v_or_b32_e32 v2, v4, v3
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: v4bf16:
+; GFX7-LABEL: v_select_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_select_v2bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_select_v2bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_select_v2bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b
+ ret <2 x bfloat> %op
+}
+
+define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_vselect_v2bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 1, v0
+; GCN-NEXT: v_and_b32_e32 v1, 1, v1
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_vselect_v2bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_vselect_v2bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_vselect_v2bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_vselect_v2bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
+; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b
+ ret <2 x bfloat> %op
+}
+
+define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) {
+; GCN-LABEL: s_select_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: s_select_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_mov_b32_e32 v2, s1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_or_b32_e32 v4, v1, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v0, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v4
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_select_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_select_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_select_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+ %cond = icmp eq i32 %c, 0
+ %op = select i1 %cond, bfloat %a, bfloat %b
+ %cast = bitcast bfloat %op to i16
+ %zext = zext i16 %cast to i32
+ %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+ ret i32 %readlane
+}
+
+; FIXME: unable to translate instruction: bitcast
+; define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, i32 %c) {
+; %cond = icmp eq i32 %c, 0
+; %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b
+; %cast = bitcast <2 x bfloat> %op to i32
+; %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+; ret i32 %readlane
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x i32> %c) {
+; %cond = icmp eq <2 x i32> %c, zeroinitializer
+; %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b
+; %cast = bitcast <2 x bfloat> %op to i32
+; %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+; ret i32 %readlane
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b) {
+; %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b
+; ret <3 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) {
+; %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b
+; ret <4 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b) {
+; %op = select i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b
+; ret <6 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) {
+; %op = select i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b
+; ret <8 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b) {
+; %op = select i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b
+; ret <16 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b) {
+; %op = select i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b
+; ret <32 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> inreg %b, i32 %c) {
+; %cond = icmp eq i32 %c, 0
+; %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b
+; %cast = bitcast <3 x bfloat> %op to i48
+; %elt0 = trunc i48 %cast to i32
+; %elt1.hi = lshr i48 %cast, 32
+; %elt1 = trunc i48 %elt1.hi to i32
+; %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
+; %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
+; %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
+; %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
+; ret <2 x i32> %bv.1
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, i32 %c) {
+; %cond = icmp eq i32 %c, 0
+; %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b
+; %cast = bitcast <4 x bfloat> %op to <2 x i32>
+; %elt0 = extractelement <2 x i32> %cast, i32 0
+; %elt1 = extractelement <2 x i32> %cast, i32 1
+; %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
+; %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
+; %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
+; %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
+; ret <2 x i32> %bv.1
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, <4 x i32> %c) {
+; %cond = icmp eq <4 x i32> %c, zeroinitializer
+; %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b
+; %cast = bitcast <4 x bfloat> %op to <2 x i32>
+; %elt0 = extractelement <2 x i32> %cast, i32 0
+; %elt1 = extractelement <2 x i32> %cast, i32 1
+; %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
+; %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
+; %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
+; %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
+; ret <2 x i32> %bv.1
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b) {
+; %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b
+; ret <4 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b) {
+; %op = select <8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b
+; ret <8 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b) {
+; %op = select <16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b
+; ret <16 x bfloat> %op
+; }
+
+; FIXME: unable to translate instruction: bitcast
+; define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b) {
+; %op = select <32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b
+; ret <32 x bfloat> %op
+; }
+
+declare bfloat @llvm.fma.bf16(bfloat, bfloat, bfloat)
+declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
+declare <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)
+declare <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
+
+define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
+; GCN-LABEL: v_fma_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_fma_f32 v0, v0, v1, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fma_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: v4bf16:
+; GFX8-LABEL: v_fma_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fma_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fma_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
+ ret bfloat %op
+}
+
+define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
+; GCN-LABEL: v_fma_v2bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: v_fma_f32 v0, v0, v2, v4
+; GCN-NEXT: v_fma_f32 v1, v1, v3, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fma_v2bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_fma_f32 v0, v0, v2, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_fma_f32 v1, v1, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fma_v2bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX8-NEXT: v_fma_f16 v1, v3, v4, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fma_v2bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fma_v2bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_fma_f16 v0, v0, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
+ ret <2 x bfloat> %op
+}
+
+define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) {
+; GCN-LABEL: v_fma_v3bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT: v_fma_f32 v0, v0, v3, v6
+; GCN-NEXT: v_fma_f32 v1, v1, v4, v7
+; GCN-NEXT: v_fma_f32 v2, v2, v5, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fma_v3bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_fma_f32 v0, v0, v3, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v8
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_fma_f32 v1, v1, v3, v4
+; GFX7-NEXT: v_fma_f32 v2, v2, v5, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fma_v3bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX8-NEXT: v_fma_f16 v0, v0, v2, v4
+; GFX8-NEXT: v_fma_f16 v1, v1, v3, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v4bf16:
+; GFX9-LABEL: v_fma_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-NEXT: v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v0
+; GFX9-NEXT: v_bfi_b32 v1, s4, v2, v2
+; GFX9-NEXT: v_bfi_b32 v2, s4, v4, v4
+; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: v4bf16:
+; GFX10-LABEL: v_fma_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX10-NEXT: v_mov_b32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v0, v0
+; GFX10-NEXT: v_bfi_b32 v1, 0xffff, v2, v2
+; GFX10-NEXT: v_bfi_b32 v2, 0xffff, v4, v4
+; GFX10-NEXT: v_pk_fma_f16 v0, v0, v1, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
+ ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
+; GCN-LABEL: v_fma_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT: v_fma_f32 v0, v0, v4, v8
+; GCN-NEXT: v_fma_f32 v1, v1, v5, v9
+; GCN-NEXT: v_fma_f32 v2, v2, v6, v10
+; GCN-NEXT: v_fma_f32 v3, v3, v7, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fma_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT: v_fma_f32 v0, v0, v4, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6
+; GFX7-NEXT: v_fma_f32 v1, v1, v5, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v10
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v11
+; GFX7-NEXT: v_fma_f32 v2, v2, v4, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_fma_f32 v3, v3, v6, v7
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fma_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX8-NEXT: v_fma_f16 v0, v0, v2, v4
+; GFX8-NEXT: v_fma_f16 v1, v1, v3, v5
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fma_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-NEXT: v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fma_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX10-NEXT: v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
+ ret <4 x bfloat> %op
+}
+
+declare bfloat @llvm.fmuladd.bf16(bfloat, bfloat, bfloat)
+declare <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
+declare <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)
+declare <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
+
+define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
+; GCN-LABEL: v_fmuladd_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v2
+; GCN-NEXT: v_add_f32_e32 v0, v0, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmuladd_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmuladd_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX8-NEXT: v_add_f16_e32 v0, v0, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmuladd_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX9-NEXT: v_add_f16_e32 v0, v0, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmuladd_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX10-NEXT: v_add_f16_e32 v0, v0, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call bfloat @llvm.fmuladd.bf16(bfloat %a, bfloat %b, bfloat %c)
+ ret bfloat %op
+}
+
+define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
+; GCN-LABEL: v_fmuladd_v2bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: v_mul_f32_e32 v0, v0, v2
+; GCN-NEXT: v_mul_f32_e32 v1, v1, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_add_f32_e32 v0, v0, v4
+; GCN-NEXT: v_add_f32_e32 v1, v1, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmuladd_v2bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmuladd_v2bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f16_e32 v3, v0, v1
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v1, v3, v2
+; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmuladd_v2bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1
+; GFX9-NEXT: v_pk_add_f16 v0, v0, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmuladd_v2bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1
+; GFX10-NEXT: v_pk_add_f16 v0, v0, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
+ ret <2 x bfloat> %op
+}
+
+define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) {
+; GCN-LABEL: v_fmuladd_v3bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT: v_mul_f32_e32 v0, v0, v3
+; GCN-NEXT: v_mul_f32_e32 v1, v1, v4
+; GCN-NEXT: v_mul_f32_e32 v2, v2, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_add_f32_e32 v0, v0, v6
+; GCN-NEXT: v_add_f32_e32 v1, v1, v7
+; GCN-NEXT: v_add_f32_e32 v2, v2, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmuladd_v3bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6
+; GFX7-NEXT: v_mul_f32_e32 v2, v2, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v7
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v8
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmuladd_v3bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f16_e32 v1, v0, v2
+; GFX8-NEXT: v_mul_f16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v0, v1, v4
+; GFX8-NEXT: v_add_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmuladd_v3bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v0
+; GFX9-NEXT: v_bfi_b32 v1, s4, v2, v2
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1
+; GFX9-NEXT: v_bfi_b32 v1, s4, v4, v4
+; GFX9-NEXT: v_pk_add_f16 v0, v0, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmuladd_v3bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v0, v0
+; GFX10-NEXT: v_bfi_b32 v1, 0xffff, v2, v2
+; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1
+; GFX10-NEXT: v_bfi_b32 v1, 0xffff, v4, v4
+; GFX10-NEXT: v_pk_add_f16 v0, v0, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
+ ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
+; GCN-LABEL: v_fmuladd_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT: v_mul_f32_e32 v0, v0, v4
+; GCN-NEXT: v_mul_f32_e32 v1, v1, v5
+; GCN-NEXT: v_mul_f32_e32 v2, v2, v6
+; GCN-NEXT: v_mul_f32_e32 v3, v3, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_add_f32_e32 v0, v0, v8
+; GCN-NEXT: v_add_f32_e32 v1, v1, v9
+; GCN-NEXT: v_add_f32_e32 v2, v2, v10
+; GCN-NEXT: v_add_f32_e32 v3, v3, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmuladd_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX7-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v9
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v10
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v11
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmuladd_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f16_e32 v1, v0, v2
+; GFX8-NEXT: v_mul_f16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v0, v1, v4
+; GFX8-NEXT: v_add_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmuladd_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-NEXT: v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
+; GFX9-NEXT: v_mov_b32_sdwa v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_pk_add_f16 v0, v0, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v4bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_or_b32_e32 v1, v2, v1
-; GFX11-NEXT: v_or_b32_e32 v2, v3, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %res = shufflevector <4 x bfloat> %arg0, <4 x bfloat> zeroinitializer, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
- ret <4 x bfloat> %res
+; GFX10-LABEL: v_fmuladd_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT: v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2
+; GFX10-NEXT: v_mov_b32_sdwa v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_pk_add_f16 v0, v0, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %op = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
+ ret <4 x bfloat> %op
}
>From 15a5665085625087e74ee24d9cb736bfe2dde053 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 9 Jan 2024 14:05:46 +0100
Subject: [PATCH 3/4] add specific test
---
llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll | 2 +
.../AMDGPU/GlobalISel/irtranslate-bf16.ll | 376 ++++++++++++++++++
2 files changed, 378 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslate-bf16.ll
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll
index aaefb634b132aa..ba292b4d046f88 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll
@@ -9,6 +9,8 @@
; llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefix=GFX11
; llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefix=GFX11
+; TODO: Once all cases are working, merge with bf16.ll in parent directory.
+
define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_load_store:
; GCN: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslate-bf16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslate-bf16.ll
new file mode 100644
index 00000000000000..3206f8e55f44eb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslate-bf16.ll
@@ -0,0 +1,376 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -global-isel -stop-after=irtranslator -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9
+
+; Tests IRTranslator lowering of bf16 vector arguments and return values.
+
+define <3 x bfloat> @v3bf16(<3 x bfloat> %arg0) {
+ ; GFX9-LABEL: name: v3bf16
+ ; GFX9: bb.1 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
+ ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
+ ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
+ ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
+ ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<3 x s16>) = G_TRUNC [[BUILD_VECTOR]](<3 x s32>)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16)
+ ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<3 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<3 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2)
+ ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<3 x s16>)
+ ; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
+ ; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT4]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT5]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %res = shufflevector <3 x bfloat> %arg0, <3 x bfloat> zeroinitializer, <3 x i32> <i32 3, i32 1, i32 2>
+ ret <3 x bfloat> %res
+}
+
+define <4 x bfloat> @v4bf16(<4 x bfloat> %arg0) {
+ ; GFX9-LABEL: name: v4bf16
+ ; GFX9: bb.1 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
+ ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
+ ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
+ ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
+ ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
+ ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<4 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0)
+ ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<4 x s16>)
+ ; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
+ ; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT4]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT5]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %res = shufflevector <4 x bfloat> %arg0, <4 x bfloat> zeroinitializer, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
+ ret <4 x bfloat> %res
+}
+
+define <5 x bfloat> @v5bf16(<5 x bfloat> %arg0) {
+ ; GFX9-LABEL: name: v5bf16
+ ; GFX9: bb.1 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
+ ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
+ ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
+ ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
+ ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
+ ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32)
+ ; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
+ ; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<5 x s16>) = G_TRUNC [[BUILD_VECTOR]](<5 x s32>)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<5 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
+ ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<5 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<5 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4)
+ ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16), [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<5 x s16>)
+ ; GFX9-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16)
+ ; GFX9-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
+ ; GFX9-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT6]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT7]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[ANYEXT8]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ %res = shufflevector <5 x bfloat> %arg0, <5 x bfloat> zeroinitializer, <5 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4>
+ ret <5 x bfloat> %res
+}
+
+define <6 x bfloat> @v6bf16(<6 x bfloat> %arg0) {
+ ; GFX9-LABEL: name: v6bf16
+ ; GFX9: bb.1 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
+ ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
+ ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
+ ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
+ ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
+ ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32)
+ ; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
+ ; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<6 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<6 x s16>) = G_TRUNC [[BUILD_VECTOR]](<6 x s32>)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<6 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
+ ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<6 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<6 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4, 5)
+ ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16), [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<6 x s16>)
+ ; GFX9-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16)
+ ; GFX9-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
+ ; GFX9-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT6]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT7]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[ANYEXT8]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ %res = shufflevector <6 x bfloat> %arg0, <6 x bfloat> zeroinitializer, <6 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5>
+ ret <6 x bfloat> %res
+}
+
+define <7 x bfloat> @v7bf16(<7 x bfloat> %arg0) {
+ ; GFX9-LABEL: name: v7bf16
+ ; GFX9: bb.1 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
+ ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
+ ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
+ ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
+ ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
+ ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32)
+ ; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
+ ; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
+ ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY3]](s32)
+ ; GFX9-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16)
+ ; GFX9-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<7 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32), [[ANYEXT6]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<7 x s16>) = G_TRUNC [[BUILD_VECTOR]](<7 x s32>)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<7 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
+ ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<7 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<7 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4, 5, 6)
+ ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16), [[UV12:%[0-9]+]]:_(s16), [[UV13:%[0-9]+]]:_(s16), [[UV14:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<7 x s16>)
+ ; GFX9-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
+ ; GFX9-NEXT: [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[UV9]](s16)
+ ; GFX9-NEXT: [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[UV10]](s16)
+ ; GFX9-NEXT: [[ANYEXT11:%[0-9]+]]:_(s32) = G_ANYEXT [[UV11]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT8]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT9]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[ANYEXT10]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[ANYEXT11]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ %res = shufflevector <7 x bfloat> %arg0, <7 x bfloat> zeroinitializer, <7 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6>
+ ret <7 x bfloat> %res
+}
+
+define <8 x bfloat> @v8bf16(<8 x bfloat> %arg0) {
+ ; GFX9-LABEL: name: v8bf16
+ ; GFX9: bb.1 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
+ ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
+ ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
+ ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
+ ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
+ ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32)
+ ; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
+ ; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
+ ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY3]](s32)
+ ; GFX9-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16)
+ ; GFX9-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32), [[ANYEXT6]](s32), [[ANYEXT7]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<8 x s16>) = G_TRUNC [[BUILD_VECTOR]](<8 x s32>)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
+ ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<8 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<8 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4, 5, 6, 7)
+ ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16), [[UV12:%[0-9]+]]:_(s16), [[UV13:%[0-9]+]]:_(s16), [[UV14:%[0-9]+]]:_(s16), [[UV15:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<8 x s16>)
+ ; GFX9-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
+ ; GFX9-NEXT: [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[UV9]](s16)
+ ; GFX9-NEXT: [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[UV10]](s16)
+ ; GFX9-NEXT: [[ANYEXT11:%[0-9]+]]:_(s32) = G_ANYEXT [[UV11]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT8]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT9]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[ANYEXT10]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[ANYEXT11]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ %res = shufflevector <8 x bfloat> %arg0, <8 x bfloat> zeroinitializer, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x bfloat> %res
+}
+
+define <16 x bfloat> @v16bf16(<16 x bfloat> %arg0) {
+ ; GFX9-LABEL: name: v16bf16
+ ; GFX9: bb.1 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
+ ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
+ ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
+ ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
+ ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
+ ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32)
+ ; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
+ ; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
+ ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY3]](s32)
+ ; GFX9-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16)
+ ; GFX9-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
+ ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY4]](s32)
+ ; GFX9-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
+ ; GFX9-NEXT: [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[UV9]](s16)
+ ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY5]](s32)
+ ; GFX9-NEXT: [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[UV10]](s16)
+ ; GFX9-NEXT: [[ANYEXT11:%[0-9]+]]:_(s32) = G_ANYEXT [[UV11]](s16)
+ ; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s16), [[UV13:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY6]](s32)
+ ; GFX9-NEXT: [[ANYEXT12:%[0-9]+]]:_(s32) = G_ANYEXT [[UV12]](s16)
+ ; GFX9-NEXT: [[ANYEXT13:%[0-9]+]]:_(s32) = G_ANYEXT [[UV13]](s16)
+ ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s16), [[UV15:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY7]](s32)
+ ; GFX9-NEXT: [[ANYEXT14:%[0-9]+]]:_(s32) = G_ANYEXT [[UV14]](s16)
+ ; GFX9-NEXT: [[ANYEXT15:%[0-9]+]]:_(s32) = G_ANYEXT [[UV15]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32), [[ANYEXT6]](s32), [[ANYEXT7]](s32), [[ANYEXT8]](s32), [[ANYEXT9]](s32), [[ANYEXT10]](s32), [[ANYEXT11]](s32), [[ANYEXT12]](s32), [[ANYEXT13]](s32), [[ANYEXT14]](s32), [[ANYEXT15]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<16 x s16>) = G_TRUNC [[BUILD_VECTOR]](<16 x s32>)
+ ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s16), [[UV17:%[0-9]+]]:_(s16), [[UV18:%[0-9]+]]:_(s16), [[UV19:%[0-9]+]]:_(s16), [[UV20:%[0-9]+]]:_(s16), [[UV21:%[0-9]+]]:_(s16), [[UV22:%[0-9]+]]:_(s16), [[UV23:%[0-9]+]]:_(s16), [[UV24:%[0-9]+]]:_(s16), [[UV25:%[0-9]+]]:_(s16), [[UV26:%[0-9]+]]:_(s16), [[UV27:%[0-9]+]]:_(s16), [[UV28:%[0-9]+]]:_(s16), [[UV29:%[0-9]+]]:_(s16), [[UV30:%[0-9]+]]:_(s16), [[UV31:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[TRUNC]](<16 x s16>)
+ ; GFX9-NEXT: [[ANYEXT16:%[0-9]+]]:_(s32) = G_ANYEXT [[UV16]](s16)
+ ; GFX9-NEXT: [[ANYEXT17:%[0-9]+]]:_(s32) = G_ANYEXT [[UV17]](s16)
+ ; GFX9-NEXT: [[ANYEXT18:%[0-9]+]]:_(s32) = G_ANYEXT [[UV18]](s16)
+ ; GFX9-NEXT: [[ANYEXT19:%[0-9]+]]:_(s32) = G_ANYEXT [[UV19]](s16)
+ ; GFX9-NEXT: [[ANYEXT20:%[0-9]+]]:_(s32) = G_ANYEXT [[UV20]](s16)
+ ; GFX9-NEXT: [[ANYEXT21:%[0-9]+]]:_(s32) = G_ANYEXT [[UV21]](s16)
+ ; GFX9-NEXT: [[ANYEXT22:%[0-9]+]]:_(s32) = G_ANYEXT [[UV22]](s16)
+ ; GFX9-NEXT: [[ANYEXT23:%[0-9]+]]:_(s32) = G_ANYEXT [[UV23]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT16]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT17]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[ANYEXT18]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[ANYEXT19]](s32)
+ ; GFX9-NEXT: $vgpr4 = COPY [[ANYEXT20]](s32)
+ ; GFX9-NEXT: $vgpr5 = COPY [[ANYEXT21]](s32)
+ ; GFX9-NEXT: $vgpr6 = COPY [[ANYEXT22]](s32)
+ ; GFX9-NEXT: $vgpr7 = COPY [[ANYEXT23]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ret <16 x bfloat> %arg0
+}
+
+define <32 x bfloat> @v32bf16(<32 x bfloat> %arg0) {
+ ; GFX9-LABEL: name: v32bf16
+ ; GFX9: bb.1 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
+ ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
+ ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
+ ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
+ ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
+ ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32)
+ ; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
+ ; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
+ ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY3]](s32)
+ ; GFX9-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16)
+ ; GFX9-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
+ ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY4]](s32)
+ ; GFX9-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
+ ; GFX9-NEXT: [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[UV9]](s16)
+ ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY5]](s32)
+ ; GFX9-NEXT: [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[UV10]](s16)
+ ; GFX9-NEXT: [[ANYEXT11:%[0-9]+]]:_(s32) = G_ANYEXT [[UV11]](s16)
+ ; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s16), [[UV13:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY6]](s32)
+ ; GFX9-NEXT: [[ANYEXT12:%[0-9]+]]:_(s32) = G_ANYEXT [[UV12]](s16)
+ ; GFX9-NEXT: [[ANYEXT13:%[0-9]+]]:_(s32) = G_ANYEXT [[UV13]](s16)
+ ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s16), [[UV15:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY7]](s32)
+ ; GFX9-NEXT: [[ANYEXT14:%[0-9]+]]:_(s32) = G_ANYEXT [[UV14]](s16)
+ ; GFX9-NEXT: [[ANYEXT15:%[0-9]+]]:_(s32) = G_ANYEXT [[UV15]](s16)
+ ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s16), [[UV17:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY8]](s32)
+ ; GFX9-NEXT: [[ANYEXT16:%[0-9]+]]:_(s32) = G_ANYEXT [[UV16]](s16)
+ ; GFX9-NEXT: [[ANYEXT17:%[0-9]+]]:_(s32) = G_ANYEXT [[UV17]](s16)
+ ; GFX9-NEXT: [[UV18:%[0-9]+]]:_(s16), [[UV19:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY9]](s32)
+ ; GFX9-NEXT: [[ANYEXT18:%[0-9]+]]:_(s32) = G_ANYEXT [[UV18]](s16)
+ ; GFX9-NEXT: [[ANYEXT19:%[0-9]+]]:_(s32) = G_ANYEXT [[UV19]](s16)
+ ; GFX9-NEXT: [[UV20:%[0-9]+]]:_(s16), [[UV21:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY10]](s32)
+ ; GFX9-NEXT: [[ANYEXT20:%[0-9]+]]:_(s32) = G_ANYEXT [[UV20]](s16)
+ ; GFX9-NEXT: [[ANYEXT21:%[0-9]+]]:_(s32) = G_ANYEXT [[UV21]](s16)
+ ; GFX9-NEXT: [[UV22:%[0-9]+]]:_(s16), [[UV23:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY11]](s32)
+ ; GFX9-NEXT: [[ANYEXT22:%[0-9]+]]:_(s32) = G_ANYEXT [[UV22]](s16)
+ ; GFX9-NEXT: [[ANYEXT23:%[0-9]+]]:_(s32) = G_ANYEXT [[UV23]](s16)
+ ; GFX9-NEXT: [[UV24:%[0-9]+]]:_(s16), [[UV25:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY12]](s32)
+ ; GFX9-NEXT: [[ANYEXT24:%[0-9]+]]:_(s32) = G_ANYEXT [[UV24]](s16)
+ ; GFX9-NEXT: [[ANYEXT25:%[0-9]+]]:_(s32) = G_ANYEXT [[UV25]](s16)
+ ; GFX9-NEXT: [[UV26:%[0-9]+]]:_(s16), [[UV27:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY13]](s32)
+ ; GFX9-NEXT: [[ANYEXT26:%[0-9]+]]:_(s32) = G_ANYEXT [[UV26]](s16)
+ ; GFX9-NEXT: [[ANYEXT27:%[0-9]+]]:_(s32) = G_ANYEXT [[UV27]](s16)
+ ; GFX9-NEXT: [[UV28:%[0-9]+]]:_(s16), [[UV29:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY14]](s32)
+ ; GFX9-NEXT: [[ANYEXT28:%[0-9]+]]:_(s32) = G_ANYEXT [[UV28]](s16)
+ ; GFX9-NEXT: [[ANYEXT29:%[0-9]+]]:_(s32) = G_ANYEXT [[UV29]](s16)
+ ; GFX9-NEXT: [[UV30:%[0-9]+]]:_(s16), [[UV31:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY15]](s32)
+ ; GFX9-NEXT: [[ANYEXT30:%[0-9]+]]:_(s32) = G_ANYEXT [[UV30]](s16)
+ ; GFX9-NEXT: [[ANYEXT31:%[0-9]+]]:_(s32) = G_ANYEXT [[UV31]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32), [[ANYEXT6]](s32), [[ANYEXT7]](s32), [[ANYEXT8]](s32), [[ANYEXT9]](s32), [[ANYEXT10]](s32), [[ANYEXT11]](s32), [[ANYEXT12]](s32), [[ANYEXT13]](s32), [[ANYEXT14]](s32), [[ANYEXT15]](s32), [[ANYEXT16]](s32), [[ANYEXT17]](s32), [[ANYEXT18]](s32), [[ANYEXT19]](s32), [[ANYEXT20]](s32), [[ANYEXT21]](s32), [[ANYEXT22]](s32), [[ANYEXT23]](s32), [[ANYEXT24]](s32), [[ANYEXT25]](s32), [[ANYEXT26]](s32), [[ANYEXT27]](s32), [[ANYEXT28]](s32), [[ANYEXT29]](s32), [[ANYEXT30]](s32), [[ANYEXT31]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<32 x s16>) = G_TRUNC [[BUILD_VECTOR]](<32 x s32>)
+ ; GFX9-NEXT: [[UV32:%[0-9]+]]:_(s16), [[UV33:%[0-9]+]]:_(s16), [[UV34:%[0-9]+]]:_(s16), [[UV35:%[0-9]+]]:_(s16), [[UV36:%[0-9]+]]:_(s16), [[UV37:%[0-9]+]]:_(s16), [[UV38:%[0-9]+]]:_(s16), [[UV39:%[0-9]+]]:_(s16), [[UV40:%[0-9]+]]:_(s16), [[UV41:%[0-9]+]]:_(s16), [[UV42:%[0-9]+]]:_(s16), [[UV43:%[0-9]+]]:_(s16), [[UV44:%[0-9]+]]:_(s16), [[UV45:%[0-9]+]]:_(s16), [[UV46:%[0-9]+]]:_(s16), [[UV47:%[0-9]+]]:_(s16), [[UV48:%[0-9]+]]:_(s16), [[UV49:%[0-9]+]]:_(s16), [[UV50:%[0-9]+]]:_(s16), [[UV51:%[0-9]+]]:_(s16), [[UV52:%[0-9]+]]:_(s16), [[UV53:%[0-9]+]]:_(s16), [[UV54:%[0-9]+]]:_(s16), [[UV55:%[0-9]+]]:_(s16), [[UV56:%[0-9]+]]:_(s16), [[UV57:%[0-9]+]]:_(s16), [[UV58:%[0-9]+]]:_(s16), [[UV59:%[0-9]+]]:_(s16), [[UV60:%[0-9]+]]:_(s16), [[UV61:%[0-9]+]]:_(s16), [[UV62:%[0-9]+]]:_(s16), [[UV63:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[TRUNC]](<32 x s16>)
+ ; GFX9-NEXT: [[ANYEXT32:%[0-9]+]]:_(s32) = G_ANYEXT [[UV32]](s16)
+ ; GFX9-NEXT: [[ANYEXT33:%[0-9]+]]:_(s32) = G_ANYEXT [[UV33]](s16)
+ ; GFX9-NEXT: [[ANYEXT34:%[0-9]+]]:_(s32) = G_ANYEXT [[UV34]](s16)
+ ; GFX9-NEXT: [[ANYEXT35:%[0-9]+]]:_(s32) = G_ANYEXT [[UV35]](s16)
+ ; GFX9-NEXT: [[ANYEXT36:%[0-9]+]]:_(s32) = G_ANYEXT [[UV36]](s16)
+ ; GFX9-NEXT: [[ANYEXT37:%[0-9]+]]:_(s32) = G_ANYEXT [[UV37]](s16)
+ ; GFX9-NEXT: [[ANYEXT38:%[0-9]+]]:_(s32) = G_ANYEXT [[UV38]](s16)
+ ; GFX9-NEXT: [[ANYEXT39:%[0-9]+]]:_(s32) = G_ANYEXT [[UV39]](s16)
+ ; GFX9-NEXT: [[ANYEXT40:%[0-9]+]]:_(s32) = G_ANYEXT [[UV40]](s16)
+ ; GFX9-NEXT: [[ANYEXT41:%[0-9]+]]:_(s32) = G_ANYEXT [[UV41]](s16)
+ ; GFX9-NEXT: [[ANYEXT42:%[0-9]+]]:_(s32) = G_ANYEXT [[UV42]](s16)
+ ; GFX9-NEXT: [[ANYEXT43:%[0-9]+]]:_(s32) = G_ANYEXT [[UV43]](s16)
+ ; GFX9-NEXT: [[ANYEXT44:%[0-9]+]]:_(s32) = G_ANYEXT [[UV44]](s16)
+ ; GFX9-NEXT: [[ANYEXT45:%[0-9]+]]:_(s32) = G_ANYEXT [[UV45]](s16)
+ ; GFX9-NEXT: [[ANYEXT46:%[0-9]+]]:_(s32) = G_ANYEXT [[UV46]](s16)
+ ; GFX9-NEXT: [[ANYEXT47:%[0-9]+]]:_(s32) = G_ANYEXT [[UV47]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT32]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT33]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[ANYEXT34]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[ANYEXT35]](s32)
+ ; GFX9-NEXT: $vgpr4 = COPY [[ANYEXT36]](s32)
+ ; GFX9-NEXT: $vgpr5 = COPY [[ANYEXT37]](s32)
+ ; GFX9-NEXT: $vgpr6 = COPY [[ANYEXT38]](s32)
+ ; GFX9-NEXT: $vgpr7 = COPY [[ANYEXT39]](s32)
+ ; GFX9-NEXT: $vgpr8 = COPY [[ANYEXT40]](s32)
+ ; GFX9-NEXT: $vgpr9 = COPY [[ANYEXT41]](s32)
+ ; GFX9-NEXT: $vgpr10 = COPY [[ANYEXT42]](s32)
+ ; GFX9-NEXT: $vgpr11 = COPY [[ANYEXT43]](s32)
+ ; GFX9-NEXT: $vgpr12 = COPY [[ANYEXT44]](s32)
+ ; GFX9-NEXT: $vgpr13 = COPY [[ANYEXT45]](s32)
+ ; GFX9-NEXT: $vgpr14 = COPY [[ANYEXT46]](s32)
+ ; GFX9-NEXT: $vgpr15 = COPY [[ANYEXT47]](s32)
+ ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
+ ret <32 x bfloat> %arg0
+}
>From 6d1c3ef39b2a7b5195c2b0a357932e52b8f2e538 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 9 Jan 2024 14:08:18 +0100
Subject: [PATCH 4/4] Remove bf16.ll
---
llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll | 13792 ------------------
1 file changed, 13792 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll
deleted file mode 100644
index ba292b4d046f88..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll
+++ /dev/null
@@ -1,13792 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -global-isel -mtriple=amdgcn | FileCheck %s -check-prefixes=GCN
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefixes=GFX7
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10
-
-; FIXME: GFX11 cannot select some truncs: %0:vgpr_32(s16) = G_TRUNC %1:vgpr_32(s32)
-; llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefix=GFX11
-; llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefix=GFX11
-
-; TODO: Once all cases are working, merge with bf16.ll in parent directory.
-
-define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_load_store:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_load_store:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_load_store:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: flat_store_short v[2:3], v0
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_load_store:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v0, v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_short v[2:3], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_load_store:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v0, v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_short v[2:3], v0, off
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %val = load bfloat, ptr addrspace(1) %in
- store bfloat %val, ptr addrspace(1) %out
- ret void
-}
-
-define <2 x bfloat> @v_load_global_v2bf16(ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_load_global_v2bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_load_global_v2bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_load_global_v2bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_load_global_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_load_global_v2bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %load = load <2 x bfloat>, ptr addrspace(1) %ptr
- ret <2 x bfloat> %load
-}
-
-define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_load_global_v3bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GCN-NEXT: v_mov_b32_e32 v0, v2
-; GCN-NEXT: v_mov_b32_e32 v2, v3
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_load_global_v3bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX7-NEXT: v_mov_b32_e32 v0, v2
-; GFX7-NEXT: v_mov_b32_e32 v2, v3
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_load_global_v3bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_load_global_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_load_global_v3bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %load = load <3 x bfloat>, ptr addrspace(1) %ptr
- ret <3 x bfloat> %load
-}
-
-define <4 x bfloat> @v_load_global_v4bf16(ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_load_global_v4bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GCN-NEXT: v_mov_b32_e32 v0, v4
-; GCN-NEXT: v_mov_b32_e32 v2, v5
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_load_global_v4bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v1
-; GFX7-NEXT: v_mov_b32_e32 v1, v4
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_load_global_v4bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_load_global_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_load_global_v4bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %load = load <4 x bfloat>, ptr addrspace(1) %ptr
- ret <4 x bfloat> %load
-}
-
-define <6 x bfloat> @v_load_global_v6bf16(ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_load_global_v6bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v8
-; GCN-NEXT: v_mov_b32_e32 v0, v6
-; GCN-NEXT: v_mov_b32_e32 v2, v7
-; GCN-NEXT: v_mov_b32_e32 v4, v8
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_load_global_v6bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx3 v[6:8], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v7
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v8
-; GFX7-NEXT: v_mov_b32_e32 v0, v6
-; GFX7-NEXT: v_mov_b32_e32 v2, v7
-; GFX7-NEXT: v_mov_b32_e32 v4, v8
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_load_global_v6bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx3 v[2:4], v[0:1]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_mov_b32_e32 v0, v2
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_load_global_v6bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx3 v[2:4], v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX9-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-NEXT: v_mov_b32_e32 v2, v3
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_load_global_v6bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx3 v[2:4], v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %load = load <6 x bfloat>, ptr addrspace(1) %ptr
- ret <6 x bfloat> %load
-}
-
-define <8 x bfloat> @v_load_global_v8bf16(ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_load_global_v8bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v11
-; GCN-NEXT: v_mov_b32_e32 v0, v8
-; GCN-NEXT: v_mov_b32_e32 v2, v9
-; GCN-NEXT: v_mov_b32_e32 v4, v10
-; GCN-NEXT: v_mov_b32_e32 v6, v11
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_load_global_v8bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v8
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v9
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v10
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v11
-; GFX7-NEXT: v_mov_b32_e32 v0, v8
-; GFX7-NEXT: v_mov_b32_e32 v2, v9
-; GFX7-NEXT: v_mov_b32_e32 v4, v10
-; GFX7-NEXT: v_mov_b32_e32 v6, v11
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_load_global_v8bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v1
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_load_global_v8bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-NEXT: v_mov_b32_e32 v1, v4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_load_global_v8bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %load = load <8 x bfloat>, ptr addrspace(1) %ptr
- ret <8 x bfloat> %load
-}
-
-define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_load_global_v16bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: buffer_load_dwordx4 v[23:26], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT: buffer_load_dwordx4 v[19:22], v[0:1], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v23
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v24
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v25
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v26
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v19
-; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v20
-; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v21
-; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v22
-; GCN-NEXT: v_mov_b32_e32 v0, v23
-; GCN-NEXT: v_mov_b32_e32 v2, v24
-; GCN-NEXT: v_mov_b32_e32 v4, v25
-; GCN-NEXT: v_mov_b32_e32 v6, v26
-; GCN-NEXT: v_mov_b32_e32 v8, v19
-; GCN-NEXT: v_mov_b32_e32 v10, v20
-; GCN-NEXT: v_mov_b32_e32 v12, v21
-; GCN-NEXT: v_mov_b32_e32 v14, v22
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_load_global_v16bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[22:25], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: buffer_load_dwordx4 v[18:21], v[0:1], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v22
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v23
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v24
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v25
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v18
-; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v19
-; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v20
-; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v21
-; GFX7-NEXT: v_mov_b32_e32 v0, v22
-; GFX7-NEXT: v_mov_b32_e32 v2, v23
-; GFX7-NEXT: v_mov_b32_e32 v4, v24
-; GFX7-NEXT: v_mov_b32_e32 v6, v25
-; GFX7-NEXT: v_mov_b32_e32 v8, v18
-; GFX7-NEXT: v_mov_b32_e32 v10, v19
-; GFX7-NEXT: v_mov_b32_e32 v12, v20
-; GFX7-NEXT: v_mov_b32_e32 v14, v21
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_load_global_v16bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v8
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v9
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v10
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v11
-; GFX8-NEXT: v_mov_b32_e32 v0, v8
-; GFX8-NEXT: v_mov_b32_e32 v2, v9
-; GFX8-NEXT: v_mov_b32_e32 v4, v10
-; GFX8-NEXT: v_mov_b32_e32 v6, v11
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_load_global_v16bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v9
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v11
-; GFX9-NEXT: v_mov_b32_e32 v0, v8
-; GFX9-NEXT: v_mov_b32_e32 v2, v9
-; GFX9-NEXT: v_mov_b32_e32 v4, v10
-; GFX9-NEXT: v_mov_b32_e32 v6, v11
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_load_global_v16bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v8
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v9
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v10
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v11
-; GFX10-NEXT: v_mov_b32_e32 v0, v8
-; GFX10-NEXT: v_mov_b32_e32 v2, v9
-; GFX10-NEXT: v_mov_b32_e32 v4, v10
-; GFX10-NEXT: v_mov_b32_e32 v6, v11
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %load = load <16 x bfloat>, ptr addrspace(1) %ptr
- ret <16 x bfloat> %load
-}
-
-define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_load_global_v32bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: buffer_load_dwordx4 v[34:37], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[39:42], v[0:1], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: buffer_load_dwordx4 v[48:51], v[0:1], s[4:7], 0 addr64 offset:32
-; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v34
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v35
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v36
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v37
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v39
-; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v40
-; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v41
-; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v42
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v48
-; GCN-NEXT: buffer_load_dwordx4 v[52:55], v[0:1], s[4:7], 0 addr64 offset:48
-; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v49
-; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v50
-; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v51
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v52
-; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v53
-; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v54
-; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v55
-; GCN-NEXT: v_mov_b32_e32 v0, v34
-; GCN-NEXT: v_mov_b32_e32 v2, v35
-; GCN-NEXT: v_mov_b32_e32 v4, v36
-; GCN-NEXT: v_mov_b32_e32 v6, v37
-; GCN-NEXT: v_mov_b32_e32 v8, v39
-; GCN-NEXT: v_mov_b32_e32 v10, v40
-; GCN-NEXT: v_mov_b32_e32 v12, v41
-; GCN-NEXT: v_mov_b32_e32 v14, v42
-; GCN-NEXT: v_mov_b32_e32 v16, v48
-; GCN-NEXT: v_mov_b32_e32 v18, v49
-; GCN-NEXT: v_mov_b32_e32 v20, v50
-; GCN-NEXT: v_mov_b32_e32 v22, v51
-; GCN-NEXT: v_mov_b32_e32 v24, v52
-; GCN-NEXT: v_mov_b32_e32 v26, v53
-; GCN-NEXT: v_mov_b32_e32 v28, v54
-; GCN-NEXT: v_mov_b32_e32 v30, v55
-; GCN-NEXT: v_mov_b32_e32 v1, v38
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_load_global_v32bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[38:41], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: buffer_load_dwordx4 v[48:51], v[0:1], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT: buffer_load_dwordx4 v[34:37], v[0:1], s[4:7], 0 addr64 offset:32
-; GFX7-NEXT: buffer_load_dwordx4 v[52:55], v[0:1], s[4:7], 0 addr64 offset:48
-; GFX7-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v40
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v41
-; GFX7-NEXT: v_mov_b32_e32 v4, v40
-; GFX7-NEXT: v_mov_b32_e32 v6, v41
-; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v38
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v39
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v48
-; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v49
-; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v50
-; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v51
-; GFX7-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v34
-; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v35
-; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v36
-; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v37
-; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v52
-; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v53
-; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v54
-; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v55
-; GFX7-NEXT: v_mov_b32_e32 v0, v38
-; GFX7-NEXT: v_mov_b32_e32 v2, v39
-; GFX7-NEXT: v_mov_b32_e32 v8, v48
-; GFX7-NEXT: v_mov_b32_e32 v10, v49
-; GFX7-NEXT: v_mov_b32_e32 v12, v50
-; GFX7-NEXT: v_mov_b32_e32 v14, v51
-; GFX7-NEXT: v_mov_b32_e32 v16, v34
-; GFX7-NEXT: v_mov_b32_e32 v18, v35
-; GFX7-NEXT: v_mov_b32_e32 v20, v36
-; GFX7-NEXT: v_mov_b32_e32 v22, v37
-; GFX7-NEXT: v_mov_b32_e32 v24, v52
-; GFX7-NEXT: v_mov_b32_e32 v26, v53
-; GFX7-NEXT: v_mov_b32_e32 v28, v54
-; GFX7-NEXT: v_mov_b32_e32 v30, v55
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_load_global_v32bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[22:25], v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dwordx4 v[18:21], v[0:1]
-; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v22
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v23
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v24
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v25
-; GFX8-NEXT: v_mov_b32_e32 v0, v22
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v18
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v19
-; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v20
-; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v21
-; GFX8-NEXT: v_mov_b32_e32 v2, v23
-; GFX8-NEXT: v_mov_b32_e32 v4, v24
-; GFX8-NEXT: v_mov_b32_e32 v6, v25
-; GFX8-NEXT: v_mov_b32_e32 v8, v18
-; GFX8-NEXT: v_mov_b32_e32 v10, v19
-; GFX8-NEXT: v_mov_b32_e32 v12, v20
-; GFX8-NEXT: v_mov_b32_e32 v14, v21
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_load_global_v32bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[22:25], v[0:1], off
-; GFX9-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v22
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v23
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v24
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v25
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v18
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v19
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v20
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v21
-; GFX9-NEXT: v_mov_b32_e32 v0, v22
-; GFX9-NEXT: v_mov_b32_e32 v2, v23
-; GFX9-NEXT: v_mov_b32_e32 v4, v24
-; GFX9-NEXT: v_mov_b32_e32 v6, v25
-; GFX9-NEXT: v_mov_b32_e32 v8, v18
-; GFX9-NEXT: v_mov_b32_e32 v10, v19
-; GFX9-NEXT: v_mov_b32_e32 v12, v20
-; GFX9-NEXT: v_mov_b32_e32 v14, v21
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_load_global_v32bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx4 v[22:25], v[0:1], off
-; GFX10-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16
-; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v22
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v23
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v24
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v25
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v18
-; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v19
-; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v20
-; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v21
-; GFX10-NEXT: v_mov_b32_e32 v0, v22
-; GFX10-NEXT: v_mov_b32_e32 v2, v23
-; GFX10-NEXT: v_mov_b32_e32 v4, v24
-; GFX10-NEXT: v_mov_b32_e32 v6, v25
-; GFX10-NEXT: v_mov_b32_e32 v8, v18
-; GFX10-NEXT: v_mov_b32_e32 v10, v19
-; GFX10-NEXT: v_mov_b32_e32 v12, v20
-; GFX10-NEXT: v_mov_b32_e32 v14, v21
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %load = load <32 x bfloat>, ptr addrspace(1) %ptr
- ret <32 x bfloat> %load
-}
-
-define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_load_global_v64bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: buffer_load_dwordx4 v[21:24], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: buffer_load_dwordx4 v[25:28], v[1:2], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: buffer_load_dwordx4 v[29:32], v[1:2], s[4:7], 0 addr64 offset:32
-; GCN-NEXT: buffer_load_dwordx4 v[17:20], v[1:2], s[4:7], 0 addr64 offset:48
-; GCN-NEXT: buffer_load_dwordx4 v[13:16], v[1:2], s[4:7], 0 addr64 offset:64
-; GCN-NEXT: buffer_load_dwordx4 v[9:12], v[1:2], s[4:7], 0 addr64 offset:80
-; GCN-NEXT: buffer_load_dwordx4 v[5:8], v[1:2], s[4:7], 0 addr64 offset:96
-; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[1:2], s[4:7], 0 addr64 offset:112
-; GCN-NEXT: s_waitcnt vmcnt(7)
-; GCN-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v21, vcc, 4, v0
-; GCN-NEXT: buffer_store_dword v22, v21, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v21, vcc, 8, v0
-; GCN-NEXT: buffer_store_dword v23, v21, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v21, vcc, 12, v0
-; GCN-NEXT: buffer_store_dword v24, v21, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v21, vcc, 16, v0
-; GCN-NEXT: s_waitcnt expcnt(2)
-; GCN-NEXT: v_add_i32_e32 v22, vcc, 20, v0
-; GCN-NEXT: s_waitcnt expcnt(1)
-; GCN-NEXT: v_add_i32_e32 v23, vcc, 24, v0
-; GCN-NEXT: s_waitcnt vmcnt(10)
-; GCN-NEXT: buffer_store_dword v25, v21, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v21, vcc, 28, v0
-; GCN-NEXT: buffer_store_dword v26, v22, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v22, vcc, 32, v0
-; GCN-NEXT: buffer_store_dword v27, v23, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v23, vcc, 36, v0
-; GCN-NEXT: buffer_store_dword v28, v21, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v21, vcc, 40, v0
-; GCN-NEXT: s_waitcnt expcnt(4)
-; GCN-NEXT: v_add_i32_e32 v24, vcc, 44, v0
-; GCN-NEXT: s_waitcnt expcnt(3)
-; GCN-NEXT: v_add_i32_e32 v25, vcc, 48, v0
-; GCN-NEXT: s_waitcnt expcnt(2)
-; GCN-NEXT: v_add_i32_e32 v26, vcc, 52, v0
-; GCN-NEXT: s_waitcnt expcnt(1)
-; GCN-NEXT: v_add_i32_e32 v27, vcc, 56, v0
-; GCN-NEXT: s_waitcnt vmcnt(13)
-; GCN-NEXT: buffer_store_dword v29, v22, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v0
-; GCN-NEXT: buffer_store_dword v30, v23, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0
-; GCN-NEXT: buffer_store_dword v31, v21, s[0:3], 0 offen
-; GCN-NEXT: v_mov_b32_e32 v21, 0x44
-; GCN-NEXT: buffer_store_dword v32, v24, s[0:3], 0 offen
-; GCN-NEXT: v_mov_b32_e32 v24, 0x48
-; GCN-NEXT: s_waitcnt expcnt(4)
-; GCN-NEXT: v_mov_b32_e32 v28, 0x4c
-; GCN-NEXT: s_waitcnt expcnt(3)
-; GCN-NEXT: v_mov_b32_e32 v29, 0x50
-; GCN-NEXT: s_waitcnt expcnt(2)
-; GCN-NEXT: v_mov_b32_e32 v30, 0x54
-; GCN-NEXT: s_waitcnt expcnt(1)
-; GCN-NEXT: v_mov_b32_e32 v31, 0x58
-; GCN-NEXT: s_waitcnt vmcnt(14)
-; GCN-NEXT: buffer_store_dword v17, v25, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v17, 0x5c
-; GCN-NEXT: buffer_store_dword v18, v26, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v18, 0x60
-; GCN-NEXT: buffer_store_dword v19, v27, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v19, 0x64
-; GCN-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v20, 0x68
-; GCN-NEXT: v_add_i32_e32 v22, vcc, 0x6c, v0
-; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x70, v0
-; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x74, v0
-; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x78, v0
-; GCN-NEXT: v_add_i32_e32 v21, vcc, v0, v21
-; GCN-NEXT: v_add_i32_e32 v24, vcc, v0, v24
-; GCN-NEXT: v_add_i32_e32 v28, vcc, v0, v28
-; GCN-NEXT: v_add_i32_e32 v29, vcc, v0, v29
-; GCN-NEXT: v_add_i32_e32 v30, vcc, v0, v30
-; GCN-NEXT: v_add_i32_e32 v31, vcc, v0, v31
-; GCN-NEXT: v_add_i32_e32 v17, vcc, v0, v17
-; GCN-NEXT: v_add_i32_e32 v18, vcc, v0, v18
-; GCN-NEXT: v_add_i32_e32 v19, vcc, v0, v19
-; GCN-NEXT: v_add_i32_e32 v20, vcc, v0, v20
-; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
-; GCN-NEXT: buffer_store_dword v13, v23, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v14, v21, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v15, v24, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v16, v28, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v9, v29, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v10, v30, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v11, v31, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v12, v17, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(14)
-; GCN-NEXT: buffer_store_dword v5, v18, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v7, v20, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v8, v22, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v3, v27, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_load_global_v64bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[21:24], v[1:2], s[4:7], 0 addr64
-; GFX7-NEXT: buffer_load_dwordx4 v[25:28], v[1:2], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT: buffer_load_dwordx4 v[29:32], v[1:2], s[4:7], 0 addr64 offset:32
-; GFX7-NEXT: buffer_load_dwordx4 v[13:16], v[1:2], s[4:7], 0 addr64 offset:48
-; GFX7-NEXT: buffer_load_dwordx4 v[17:20], v[1:2], s[4:7], 0 addr64 offset:64
-; GFX7-NEXT: buffer_load_dwordx4 v[9:12], v[1:2], s[4:7], 0 addr64 offset:80
-; GFX7-NEXT: buffer_load_dwordx4 v[5:8], v[1:2], s[4:7], 0 addr64 offset:96
-; GFX7-NEXT: buffer_load_dwordx4 v[1:4], v[1:2], s[4:7], 0 addr64 offset:112
-; GFX7-NEXT: s_waitcnt vmcnt(7)
-; GFX7-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v21, vcc, 4, v0
-; GFX7-NEXT: buffer_store_dword v22, v21, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v21, vcc, 8, v0
-; GFX7-NEXT: buffer_store_dword v23, v21, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v21, vcc, 12, v0
-; GFX7-NEXT: v_add_i32_e32 v23, vcc, 16, v0
-; GFX7-NEXT: buffer_store_dword v24, v21, s[0:3], 0 offen
-; GFX7-NEXT: s_waitcnt vmcnt(10)
-; GFX7-NEXT: buffer_store_dword v25, v23, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v23, vcc, 20, v0
-; GFX7-NEXT: buffer_store_dword v26, v23, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v23, vcc, 24, v0
-; GFX7-NEXT: buffer_store_dword v27, v23, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v23, vcc, 28, v0
-; GFX7-NEXT: v_add_i32_e32 v26, vcc, 32, v0
-; GFX7-NEXT: buffer_store_dword v28, v23, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v27, vcc, 36, v0
-; GFX7-NEXT: s_waitcnt vmcnt(13)
-; GFX7-NEXT: buffer_store_dword v29, v26, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v26, vcc, 40, v0
-; GFX7-NEXT: v_mov_b32_e32 v21, 0x44
-; GFX7-NEXT: v_mov_b32_e32 v22, 0x48
-; GFX7-NEXT: v_mov_b32_e32 v23, 0x4c
-; GFX7-NEXT: v_mov_b32_e32 v24, 0x50
-; GFX7-NEXT: v_mov_b32_e32 v25, 0x54
-; GFX7-NEXT: buffer_store_dword v30, v27, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v27, vcc, 44, v0
-; GFX7-NEXT: buffer_store_dword v31, v26, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v26, vcc, 48, v0
-; GFX7-NEXT: buffer_store_dword v32, v27, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v27, vcc, 52, v0
-; GFX7-NEXT: v_add_i32_e32 v28, vcc, 56, v0
-; GFX7-NEXT: v_add_i32_e32 v29, vcc, 60, v0
-; GFX7-NEXT: v_add_i32_e32 v30, vcc, 64, v0
-; GFX7-NEXT: v_add_i32_e32 v21, vcc, v0, v21
-; GFX7-NEXT: v_add_i32_e32 v22, vcc, v0, v22
-; GFX7-NEXT: v_add_i32_e32 v23, vcc, v0, v23
-; GFX7-NEXT: v_add_i32_e32 v24, vcc, v0, v24
-; GFX7-NEXT: v_add_i32_e32 v25, vcc, v0, v25
-; GFX7-NEXT: s_waitcnt vmcnt(14)
-; GFX7-NEXT: buffer_store_dword v13, v26, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v14, v27, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v15, v28, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v16, v29, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v17, v30, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v19, v22, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v9, v24, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v10, v25, s[0:3], 0 offen
-; GFX7-NEXT: v_mov_b32_e32 v9, 0x58
-; GFX7-NEXT: v_add_i32_e32 v9, vcc, v0, v9
-; GFX7-NEXT: buffer_store_dword v11, v9, s[0:3], 0 offen
-; GFX7-NEXT: v_mov_b32_e32 v9, 0x5c
-; GFX7-NEXT: v_add_i32_e32 v9, vcc, v0, v9
-; GFX7-NEXT: buffer_store_dword v12, v9, s[0:3], 0 offen
-; GFX7-NEXT: v_mov_b32_e32 v9, 0x60
-; GFX7-NEXT: v_add_i32_e32 v9, vcc, v0, v9
-; GFX7-NEXT: s_waitcnt vmcnt(14)
-; GFX7-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen
-; GFX7-NEXT: v_mov_b32_e32 v5, 0x64
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, v0, v5
-; GFX7-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen
-; GFX7-NEXT: v_mov_b32_e32 v5, 0x68
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, v0, v5
-; GFX7-NEXT: buffer_store_dword v7, v5, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x6c, v0
-; GFX7-NEXT: buffer_store_dword v8, v5, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0
-; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x74, v0
-; GFX7-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
-; GFX7-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_load_global_v64bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX8-NEXT: flat_load_dwordx4 v[38:41], v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dwordx4 v[48:51], v[2:3]
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dwordx4 v[34:37], v[2:3]
-; GFX8-NEXT: flat_load_dwordx4 v[52:55], v[0:1]
-; GFX8-NEXT: s_waitcnt vmcnt(3)
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v40
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v41
-; GFX8-NEXT: v_mov_b32_e32 v4, v40
-; GFX8-NEXT: v_mov_b32_e32 v6, v41
-; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v38
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v39
-; GFX8-NEXT: s_waitcnt vmcnt(4)
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v48
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v49
-; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v50
-; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v51
-; GFX8-NEXT: v_mov_b32_e32 v0, v38
-; GFX8-NEXT: v_mov_b32_e32 v2, v39
-; GFX8-NEXT: v_mov_b32_e32 v8, v48
-; GFX8-NEXT: s_waitcnt vmcnt(3)
-; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v34
-; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v35
-; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v36
-; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v37
-; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v52
-; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v53
-; GFX8-NEXT: v_lshrrev_b32_e32 v29, 16, v54
-; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v55
-; GFX8-NEXT: v_mov_b32_e32 v10, v49
-; GFX8-NEXT: v_mov_b32_e32 v12, v50
-; GFX8-NEXT: v_mov_b32_e32 v14, v51
-; GFX8-NEXT: v_mov_b32_e32 v16, v34
-; GFX8-NEXT: v_mov_b32_e32 v18, v35
-; GFX8-NEXT: v_mov_b32_e32 v20, v36
-; GFX8-NEXT: v_mov_b32_e32 v22, v37
-; GFX8-NEXT: v_mov_b32_e32 v24, v52
-; GFX8-NEXT: v_mov_b32_e32 v26, v53
-; GFX8-NEXT: v_mov_b32_e32 v28, v54
-; GFX8-NEXT: v_mov_b32_e32 v30, v55
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_load_global_v64bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-NEXT: global_load_dwordx4 v[38:41], v[0:1], off
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:16
-; GFX9-NEXT: global_load_dwordx4 v[34:37], v[0:1], off offset:32
-; GFX9-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:48
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v40
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v41
-; GFX9-NEXT: v_mov_b32_e32 v4, v40
-; GFX9-NEXT: v_mov_b32_e32 v6, v41
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v38
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v39
-; GFX9-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v48
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v49
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v50
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v51
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v34
-; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v35
-; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v36
-; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v37
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v52
-; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v53
-; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v54
-; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v55
-; GFX9-NEXT: v_mov_b32_e32 v0, v38
-; GFX9-NEXT: v_mov_b32_e32 v2, v39
-; GFX9-NEXT: v_mov_b32_e32 v8, v48
-; GFX9-NEXT: v_mov_b32_e32 v10, v49
-; GFX9-NEXT: v_mov_b32_e32 v12, v50
-; GFX9-NEXT: v_mov_b32_e32 v14, v51
-; GFX9-NEXT: v_mov_b32_e32 v16, v34
-; GFX9-NEXT: v_mov_b32_e32 v18, v35
-; GFX9-NEXT: v_mov_b32_e32 v20, v36
-; GFX9-NEXT: v_mov_b32_e32 v22, v37
-; GFX9-NEXT: v_mov_b32_e32 v24, v52
-; GFX9-NEXT: v_mov_b32_e32 v26, v53
-; GFX9-NEXT: v_mov_b32_e32 v28, v54
-; GFX9-NEXT: v_mov_b32_e32 v30, v55
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_load_global_v64bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_clause 0x3
-; GFX10-NEXT: global_load_dwordx4 v[64:67], v[0:1], off
-; GFX10-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:16
-; GFX10-NEXT: global_load_dwordx4 v[34:37], v[0:1], off offset:32
-; GFX10-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:48
-; GFX10-NEXT: s_waitcnt vmcnt(3)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v64
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v65
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v66
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v67
-; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v48
-; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v49
-; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v50
-; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v51
-; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v34
-; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v35
-; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v36
-; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v37
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v52
-; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v53
-; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v54
-; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v55
-; GFX10-NEXT: v_mov_b32_e32 v0, v64
-; GFX10-NEXT: v_mov_b32_e32 v2, v65
-; GFX10-NEXT: v_mov_b32_e32 v4, v66
-; GFX10-NEXT: v_mov_b32_e32 v6, v67
-; GFX10-NEXT: v_mov_b32_e32 v8, v48
-; GFX10-NEXT: v_mov_b32_e32 v10, v49
-; GFX10-NEXT: v_mov_b32_e32 v12, v50
-; GFX10-NEXT: v_mov_b32_e32 v14, v51
-; GFX10-NEXT: v_mov_b32_e32 v16, v34
-; GFX10-NEXT: v_mov_b32_e32 v18, v35
-; GFX10-NEXT: v_mov_b32_e32 v20, v36
-; GFX10-NEXT: v_mov_b32_e32 v22, v37
-; GFX10-NEXT: v_mov_b32_e32 v24, v52
-; GFX10-NEXT: v_mov_b32_e32 v26, v53
-; GFX10-NEXT: v_mov_b32_e32 v28, v54
-; GFX10-NEXT: v_mov_b32_e32 v30, v55
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %load = load <64 x bfloat>, ptr addrspace(1) %ptr
- ret <64 x bfloat> %load
-}
-
-define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_store_global_v2bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: v_or_b32_e32 v0, v1, v0
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_store_global_v2bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_store_global_v2bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_store_dword v[1:2], v0
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_store_global_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v[1:2], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_store_global_v2bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_store_dword v[1:2], v0, off
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- store <2 x bfloat> %val, ptr addrspace(1) %ptr
- ret void
-}
-
-define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_store_global_v3bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: buffer_store_short v0, v[3:4], s[4:7], 0 addr64
-; GCN-NEXT: buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:2
-; GCN-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_store_global_v3bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_store_short v0, v[3:4], s[4:7], 0 addr64
-; GFX7-NEXT: buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:2
-; GFX7-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_store_global_v3bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 2, v2
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_store_short v[2:3], v0
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_store_short v[4:5], v6
-; GFX8-NEXT: flat_store_short v[2:3], v1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_store_global_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_store_short v[2:3], v0, off
-; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off offset:2
-; GFX9-NEXT: global_store_short v[2:3], v1, off offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_store_global_v3bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_store_short v[2:3], v0, off
-; GFX10-NEXT: global_store_short_d16_hi v[2:3], v0, off offset:2
-; GFX10-NEXT: global_store_short v[2:3], v1, off offset:4
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- store <3 x bfloat> %val, ptr addrspace(1) %ptr
- ret void
-}
-
-define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_store_global_v4bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: v_or_b32_e32 v0, v1, v0
-; GCN-NEXT: v_or_b32_e32 v1, v3, v2
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_store_global_v4bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_store_global_v4bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX8-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_store_global_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX9-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_store_global_v4bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX10-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- store <4 x bfloat> %val, ptr addrspace(1) %ptr
- ret void
-}
-
-define void @v_store_global_v8bf16(<8 x bfloat> %val, ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_store_global_v8bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: v_or_b32_e32 v0, v1, v0
-; GCN-NEXT: v_or_b32_e32 v1, v3, v2
-; GCN-NEXT: v_or_b32_e32 v2, v5, v4
-; GCN-NEXT: v_or_b32_e32 v3, v7, v6
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_store_global_v8bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_store_global_v8bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v3
-; GFX8-NEXT: v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_store_global_v8bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v3
-; GFX9-NEXT: v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_store_global_v8bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v3
-; GFX10-NEXT: v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- store <8 x bfloat> %val, ptr addrspace(1) %ptr
- ret void
-}
-
-define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_store_global_v16bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: v_or_b32_e32 v0, v1, v0
-; GCN-NEXT: v_or_b32_e32 v1, v3, v2
-; GCN-NEXT: v_or_b32_e32 v2, v5, v4
-; GCN-NEXT: v_or_b32_e32 v3, v7, v6
-; GCN-NEXT: v_or_b32_e32 v4, v9, v8
-; GCN-NEXT: v_or_b32_e32 v5, v11, v10
-; GCN-NEXT: v_or_b32_e32 v6, v13, v12
-; GCN-NEXT: v_or_b32_e32 v7, v15, v14
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
-; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_store_global_v16bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v9
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v8
-; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v11
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v10
-; GFX7-NEXT: v_or_b32_e32 v5, v5, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v13
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v12
-; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v15
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v14
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_or_b32_e32 v7, v7, v8
-; GFX7-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
-; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_store_global_v16bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v3
-; GFX8-NEXT: v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v6
-; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v7
-; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
-; GFX8-NEXT: v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v8
-; GFX8-NEXT: v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc
-; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_store_global_v16bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v5
-; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v6
-; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v7
-; GFX9-NEXT: v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_store_global_v16bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v5
-; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v6
-; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v7
-; GFX10-NEXT: v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
-; GFX10-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- store <16 x bfloat> %val, ptr addrspace(1) %ptr
- ret void
-}
-
-define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_store_global_v32bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT: v_or_b32_e32 v0, v1, v0
-; GCN-NEXT: v_or_b32_e32 v1, v3, v2
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GCN-NEXT: v_or_b32_e32 v2, v2, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GCN-NEXT: v_or_b32_e32 v3, v3, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v9
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v8
-; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v11
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v10
-; GCN-NEXT: v_or_b32_e32 v4, v4, v5
-; GCN-NEXT: v_or_b32_e32 v5, v6, v7
-; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v13
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v12
-; GCN-NEXT: v_or_b32_e32 v6, v6, v7
-; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v15
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v14
-; GCN-NEXT: v_or_b32_e32 v7, v7, v8
-; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v17
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v16
-; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v19
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v18
-; GCN-NEXT: v_or_b32_e32 v8, v8, v9
-; GCN-NEXT: v_or_b32_e32 v9, v10, v11
-; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v21
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v20
-; GCN-NEXT: v_or_b32_e32 v10, v10, v11
-; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v23
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v22
-; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v25
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v24
-; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v27
-; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v26
-; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v29
-; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v28
-; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v30
-; GCN-NEXT: v_or_b32_e32 v11, v11, v12
-; GCN-NEXT: v_or_b32_e32 v12, v13, v14
-; GCN-NEXT: v_or_b32_e32 v13, v15, v16
-; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32
-; GCN-NEXT: v_or_b32_e32 v14, v17, v18
-; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
-; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
-; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: v_or_b32_e32 v15, v15, v19
-; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:32
-; GCN-NEXT: buffer_store_dwordx4 v[12:15], v[16:17], s[4:7], 0 addr64 offset:48
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_store_global_v32bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT: v_or_b32_e32 v1, v3, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v9
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v8
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v11
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v10
-; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX7-NEXT: v_or_b32_e32 v5, v6, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v13
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v12
-; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v15
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v14
-; GFX7-NEXT: v_or_b32_e32 v7, v7, v8
-; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v17
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v16
-; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v19
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v18
-; GFX7-NEXT: v_or_b32_e32 v8, v8, v9
-; GFX7-NEXT: v_or_b32_e32 v9, v10, v11
-; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v21
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v20
-; GFX7-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v23
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v22
-; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v25
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v24
-; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v27
-; GFX7-NEXT: v_and_b32_e32 v16, 0xffff, v26
-; GFX7-NEXT: v_or_b32_e32 v11, v11, v12
-; GFX7-NEXT: v_or_b32_e32 v12, v13, v14
-; GFX7-NEXT: v_or_b32_e32 v13, v15, v16
-; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32
-; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v29
-; GFX7-NEXT: v_and_b32_e32 v16, 0xffff, v28
-; GFX7-NEXT: v_or_b32_e32 v14, v14, v16
-; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
-; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8
-; GFX7-NEXT: v_and_b32_e32 v18, 0xffff, v30
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX7-NEXT: v_or_b32_e32 v15, v15, v18
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
-; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:32
-; GFX7-NEXT: buffer_store_dwordx4 v[12:15], v[16:17], s[4:7], 0 addr64 offset:48
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_store_global_v32bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v3
-; GFX8-NEXT: v_mov_b32_sdwa v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v1, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v2, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v3, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v6
-; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v7
-; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
-; GFX8-NEXT: v_mov_b32_sdwa v4, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v16
-; GFX8-NEXT: v_mov_b32_sdwa v5, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v7, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v26, 16, v8
-; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v9
-; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v10
-; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v11
-; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v16
-; GFX8-NEXT: v_mov_b32_sdwa v8, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v9, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v11, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v12
-; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v13
-; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v14
-; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v15
-; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v16
-; GFX8-NEXT: v_mov_b32_sdwa v12, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v13, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v14, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v15, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc
-; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_store_global_v32bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v5
-; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v6
-; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7
-; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v9
-; GFX9-NEXT: v_mov_b32_sdwa v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10
-; GFX9-NEXT: v_mov_b32_sdwa v1, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v11
-; GFX9-NEXT: v_mov_b32_sdwa v2, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v12
-; GFX9-NEXT: v_mov_b32_sdwa v3, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v13
-; GFX9-NEXT: v_mov_b32_sdwa v4, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v14
-; GFX9-NEXT: v_mov_b32_sdwa v5, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v15
-; GFX9-NEXT: v_mov_b32_sdwa v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v7, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v8, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v9, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v11, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v12, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v13, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v14, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v15, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: global_store_dwordx4 v[16:17], v[0:3], off
-; GFX9-NEXT: global_store_dwordx4 v[16:17], v[4:7], off offset:16
-; GFX9-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:32
-; GFX9-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:48
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_store_global_v32bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v5
-; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v6
-; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v7
-; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v8
-; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v9
-; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v10
-; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v11
-; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v12
-; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v13
-; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v14
-; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v15
-; GFX10-NEXT: v_mov_b32_sdwa v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v1, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v2, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v3, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v4, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v5, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v7, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v8, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v9, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v10, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v11, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v12, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v13, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v14, v32 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v15, v33 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: global_store_dwordx4 v[16:17], v[0:3], off
-; GFX10-NEXT: global_store_dwordx4 v[16:17], v[4:7], off offset:16
-; GFX10-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:32
-; GFX10-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:48
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- store <32 x bfloat> %val, ptr addrspace(1) %ptr
- ret void
-}
-
-define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
-; GCN-LABEL: v_store_global_v64bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT: v_or_b32_e32 v0, v1, v0
-; GCN-NEXT: v_or_b32_e32 v1, v3, v2
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GCN-NEXT: v_or_b32_e32 v2, v2, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GCN-NEXT: v_or_b32_e32 v3, v3, v4
-; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132
-; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v9
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v8
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v11
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v10
-; GCN-NEXT: v_or_b32_e32 v0, v0, v1
-; GCN-NEXT: v_or_b32_e32 v1, v2, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v13
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v12
-; GCN-NEXT: v_or_b32_e32 v2, v2, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v15
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v14
-; GCN-NEXT: v_or_b32_e32 v3, v3, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v17
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v16
-; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v19
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18
-; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v21
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v20
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v23
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v22
-; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v25
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v24
-; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v27
-; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v26
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32
-; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:4
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_or_b32_e32 v0, v6, v7
-; GCN-NEXT: v_or_b32_e32 v1, v8, v9
-; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
-; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12
-; GCN-NEXT: v_or_b32_e32 v2, v10, v11
-; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
-; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20
-; GCN-NEXT: v_or_b32_e32 v3, v12, v13
-; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24
-; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:28
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v29
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v28
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:32
-; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_or_b32_e32 v0, v14, v15
-; GCN-NEXT: v_or_b32_e32 v1, v16, v17
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:40
-; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:44
-; GCN-NEXT: v_or_b32_e32 v2, v12, v13
-; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48
-; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v30
-; GCN-NEXT: s_waitcnt vmcnt(14)
-; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v18
-; GCN-NEXT: v_or_b32_e32 v3, v16, v3
-; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56
-; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:60
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:48
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64
-; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68
-; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v19
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v8
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v7
-; GCN-NEXT: v_or_b32_e32 v0, v0, v1
-; GCN-NEXT: v_or_b32_e32 v1, v2, v3
-; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72
-; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76
-; GCN-NEXT: s_waitcnt vmcnt(14)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v10
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v9
-; GCN-NEXT: v_or_b32_e32 v2, v2, v3
-; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80
-; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84
-; GCN-NEXT: s_waitcnt vmcnt(14)
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v20
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v11
-; GCN-NEXT: s_waitcnt vmcnt(12)
-; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v14
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v21
-; GCN-NEXT: s_waitcnt vmcnt(10)
-; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v12
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v15
-; GCN-NEXT: v_or_b32_e32 v3, v3, v6
-; GCN-NEXT: v_or_b32_e32 v6, v7, v8
-; GCN-NEXT: v_or_b32_e32 v7, v10, v11
-; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:88
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92
-; GCN-NEXT: s_waitcnt vmcnt(10)
-; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v16
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v13
-; GCN-NEXT: v_or_b32_e32 v8, v8, v10
-; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96
-; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100
-; GCN-NEXT: s_waitcnt vmcnt(9)
-; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v18
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v17
-; GCN-NEXT: s_waitcnt vmcnt(7)
-; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v19
-; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v22
-; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v9
-; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v23
-; GCN-NEXT: v_or_b32_e32 v9, v10, v11
-; GCN-NEXT: v_or_b32_e32 v10, v16, v17
-; GCN-NEXT: v_or_b32_e32 v11, v18, v19
-; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104
-; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108
-; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v24
-; GCN-NEXT: v_or_b32_e32 v12, v12, v18
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112
-; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120
-; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GCN-NEXT: v_or_b32_e32 v13, v13, v14
-; GCN-NEXT: v_or_b32_e32 v14, v16, v15
-; GCN-NEXT: v_or_b32_e32 v15, v18, v17
-; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116
-; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:128
-; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v19
-; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124
-; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GCN-NEXT: v_or_b32_e32 v16, v18, v16
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GCN-NEXT: v_or_b32_e32 v17, v17, v18
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:64
-; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[4:7], 0 addr64 offset:80
-; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[4:5], s[4:7], 0 addr64 offset:96
-; GCN-NEXT: buffer_store_dwordx4 v[14:17], v[4:5], s[4:7], 0 addr64 offset:112
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_store_global_v64bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_or_b32_e32 v35, v1, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v4
-; GFX7-NEXT: v_or_b32_e32 v37, v0, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v7
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v6
-; GFX7-NEXT: v_or_b32_e32 v38, v0, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v9
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v8
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT: v_or_b32_e32 v31, v0, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v13
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v12
-; GFX7-NEXT: v_or_b32_e32 v36, v3, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v11
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v10
-; GFX7-NEXT: v_or_b32_e32 v33, v0, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v15
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v14
-; GFX7-NEXT: v_or_b32_e32 v32, v2, v3
-; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32
-; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:4
-; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:8
-; GFX7-NEXT: v_or_b32_e32 v34, v0, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v17
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v16
-; GFX7-NEXT: v_or_b32_e32 v4, v0, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v19
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v18
-; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:16
-; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12
-; GFX7-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v21
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v20
-; GFX7-NEXT: v_or_b32_e32 v6, v0, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v23
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v22
-; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24
-; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:20
-; GFX7-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v25
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v24
-; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132
-; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136
-; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v27
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v26
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v29
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v28
-; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:28
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v15
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v30
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: s_waitcnt vmcnt(9)
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: s_waitcnt vmcnt(8)
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX7-NEXT: s_waitcnt vmcnt(7)
-; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX7-NEXT: v_or_b32_e32 v8, v9, v8
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v15
-; GFX7-NEXT: s_waitcnt vmcnt(6)
-; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v14
-; GFX7-NEXT: s_waitcnt vmcnt(5)
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX7-NEXT: v_or_b32_e32 v9, v9, v13
-; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:32
-; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:36
-; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:40
-; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48
-; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:44
-; GFX7-NEXT: s_waitcnt vmcnt(9)
-; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX7-NEXT: s_waitcnt vmcnt(8)
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX7-NEXT: v_or_b32_e32 v10, v12, v10
-; GFX7-NEXT: s_waitcnt vmcnt(6)
-; GFX7-NEXT: buffer_store_dwordx4 v[35:38], v[24:25], s[4:7], 0 addr64
-; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52
-; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56
-; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60
-; GFX7-NEXT: buffer_store_dwordx4 v[31:34], v[24:25], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64
-; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68
-; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72
-; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76
-; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:80
-; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88
-; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:84
-; GFX7-NEXT: s_waitcnt vmcnt(14)
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13
-; GFX7-NEXT: v_or_b32_e32 v11, v12, v11
-; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v15
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v14
-; GFX7-NEXT: v_or_b32_e32 v12, v12, v13
-; GFX7-NEXT: s_waitcnt vmcnt(13)
-; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v16
-; GFX7-NEXT: s_waitcnt vmcnt(12)
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v17
-; GFX7-NEXT: v_or_b32_e32 v13, v13, v14
-; GFX7-NEXT: s_waitcnt vmcnt(10)
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v18
-; GFX7-NEXT: s_waitcnt vmcnt(9)
-; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v19
-; GFX7-NEXT: v_or_b32_e32 v14, v14, v15
-; GFX7-NEXT: s_waitcnt vmcnt(6)
-; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v21
-; GFX7-NEXT: v_and_b32_e32 v16, 0xffff, v20
-; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92
-; GFX7-NEXT: v_or_b32_e32 v15, v15, v16
-; GFX7-NEXT: s_waitcnt vmcnt(5)
-; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v23
-; GFX7-NEXT: v_and_b32_e32 v17, 0xffff, v22
-; GFX7-NEXT: v_or_b32_e32 v16, v16, v17
-; GFX7-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v27
-; GFX7-NEXT: v_and_b32_e32 v18, 0xffff, v26
-; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96
-; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100
-; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104
-; GFX7-NEXT: v_or_b32_e32 v17, v17, v18
-; GFX7-NEXT: s_waitcnt vmcnt(5)
-; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v28
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_and_b32_e32 v23, 0xffff, v29
-; GFX7-NEXT: v_or_b32_e32 v18, v18, v23
-; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112
-; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108
-; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120
-; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116
-; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:124
-; GFX7-NEXT: s_waitcnt vmcnt(8)
-; GFX7-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX7-NEXT: s_waitcnt vmcnt(7)
-; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; GFX7-NEXT: v_or_b32_e32 v19, v20, v19
-; GFX7-NEXT: s_waitcnt vmcnt(5)
-; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v22
-; GFX7-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23
-; GFX7-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NEXT: v_and_b32_e32 v23, 0xffff, v26
-; GFX7-NEXT: v_or_b32_e32 v20, v20, v21
-; GFX7-NEXT: v_or_b32_e32 v21, v22, v23
-; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v27
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_and_b32_e32 v23, 0xffff, v28
-; GFX7-NEXT: v_or_b32_e32 v22, v22, v23
-; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_and_b32_e32 v26, 0xffff, v29
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX7-NEXT: v_or_b32_e32 v23, v23, v26
-; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[24:25], s[4:7], 0 addr64 offset:32
-; GFX7-NEXT: buffer_store_dwordx4 v[0:3], v[24:25], s[4:7], 0 addr64 offset:48
-; GFX7-NEXT: buffer_store_dwordx4 v[8:11], v[24:25], s[4:7], 0 addr64 offset:64
-; GFX7-NEXT: buffer_store_dwordx4 v[12:15], v[24:25], s[4:7], 0 addr64 offset:80
-; GFX7-NEXT: buffer_store_dwordx4 v[16:19], v[24:25], s[4:7], 0 addr64 offset:96
-; GFX7-NEXT: buffer_store_dwordx4 v[20:23], v[24:25], s[4:7], 0 addr64 offset:112
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_store_global_v64bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX8-NEXT: v_mov_b32_sdwa v4, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v5
-; GFX8-NEXT: v_mov_b32_sdwa v5, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v6
-; GFX8-NEXT: v_mov_b32_sdwa v6, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v7
-; GFX8-NEXT: v_mov_b32_sdwa v7, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_add_u32_e32 v34, vcc, 16, v32
-; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_addc_u32_e32 v35, vcc, 0, v33, vcc
-; GFX8-NEXT: flat_store_dwordx4 v[34:35], v[4:7]
-; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v8
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v9
-; GFX8-NEXT: v_mov_b32_sdwa v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v10
-; GFX8-NEXT: v_mov_b32_sdwa v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v11
-; GFX8-NEXT: v_mov_b32_sdwa v10, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v12
-; GFX8-NEXT: v_mov_b32_sdwa v11, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v13
-; GFX8-NEXT: v_mov_b32_sdwa v12, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v14
-; GFX8-NEXT: v_mov_b32_sdwa v13, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v15
-; GFX8-NEXT: v_mov_b32_sdwa v14, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX8-NEXT: v_mov_b32_sdwa v15, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX8-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
-; GFX8-NEXT: v_mov_b32_sdwa v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v32
-; GFX8-NEXT: v_mov_b32_sdwa v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v33, vcc
-; GFX8-NEXT: flat_store_dwordx4 v[32:33], v[0:3]
-; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v32
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v33, vcc
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[8:11]
-; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v16
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v17
-; GFX8-NEXT: v_mov_b32_sdwa v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v18
-; GFX8-NEXT: v_mov_b32_sdwa v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v19
-; GFX8-NEXT: v_mov_b32_sdwa v18, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v20
-; GFX8-NEXT: v_mov_b32_sdwa v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v21
-; GFX8-NEXT: v_mov_b32_sdwa v20, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v22
-; GFX8-NEXT: v_mov_b32_sdwa v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v23
-; GFX8-NEXT: v_mov_b32_sdwa v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v24
-; GFX8-NEXT: v_mov_b32_sdwa v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v25
-; GFX8-NEXT: v_mov_b32_sdwa v24, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v26
-; GFX8-NEXT: v_mov_b32_sdwa v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v27
-; GFX8-NEXT: v_mov_b32_sdwa v26, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v28
-; GFX8-NEXT: v_mov_b32_sdwa v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v29
-; GFX8-NEXT: v_mov_b32_sdwa v28, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v30
-; GFX8-NEXT: v_mov_b32_sdwa v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: s_waitcnt vmcnt(4)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v31
-; GFX8-NEXT: v_mov_b32_sdwa v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v32
-; GFX8-NEXT: v_mov_b32_sdwa v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v33, vcc
-; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
-; GFX8-NEXT: v_mov_b32_e32 v0, 0x50
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v32, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v33, vcc
-; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x60, v32
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v33, vcc
-; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x70, v32
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v33, vcc
-; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[28:31]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_store_global_v64bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v0
-; GFX9-NEXT: v_mov_b32_sdwa v0, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1
-; GFX9-NEXT: v_mov_b32_sdwa v1, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v2
-; GFX9-NEXT: v_mov_b32_sdwa v2, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v3
-; GFX9-NEXT: v_mov_b32_sdwa v3, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: global_store_dwordx4 v[32:33], v[0:3], off
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5
-; GFX9-NEXT: v_mov_b32_sdwa v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v6
-; GFX9-NEXT: v_mov_b32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v7
-; GFX9-NEXT: v_mov_b32_sdwa v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v8
-; GFX9-NEXT: v_mov_b32_sdwa v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v9
-; GFX9-NEXT: v_mov_b32_sdwa v8, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10
-; GFX9-NEXT: v_mov_b32_sdwa v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v11
-; GFX9-NEXT: v_mov_b32_sdwa v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12
-; GFX9-NEXT: v_mov_b32_sdwa v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v13
-; GFX9-NEXT: v_mov_b32_sdwa v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v14
-; GFX9-NEXT: v_mov_b32_sdwa v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v15
-; GFX9-NEXT: v_mov_b32_sdwa v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v16
-; GFX9-NEXT: v_mov_b32_sdwa v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v17
-; GFX9-NEXT: v_mov_b32_sdwa v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v18
-; GFX9-NEXT: v_mov_b32_sdwa v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v19
-; GFX9-NEXT: v_mov_b32_sdwa v18, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v20
-; GFX9-NEXT: v_mov_b32_sdwa v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v21
-; GFX9-NEXT: v_mov_b32_sdwa v20, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v22
-; GFX9-NEXT: v_mov_b32_sdwa v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v23
-; GFX9-NEXT: v_mov_b32_sdwa v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24
-; GFX9-NEXT: v_mov_b32_sdwa v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v25
-; GFX9-NEXT: v_mov_b32_sdwa v24, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26
-; GFX9-NEXT: v_mov_b32_sdwa v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v27
-; GFX9-NEXT: v_mov_b32_sdwa v26, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28
-; GFX9-NEXT: v_mov_b32_sdwa v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v29
-; GFX9-NEXT: v_mov_b32_sdwa v28, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v30
-; GFX9-NEXT: v_mov_b32_sdwa v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v31
-; GFX9-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16
-; GFX9-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32
-; GFX9-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48
-; GFX9-NEXT: v_mov_b32_sdwa v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64
-; GFX9-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80
-; GFX9-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96
-; GFX9-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_store_global_v64bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_clause 0x2
-; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v5
-; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v6
-; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v7
-; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v8
-; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v9
-; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v10
-; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v11
-; GFX10-NEXT: v_mov_b32_sdwa v0, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v19
-; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v12
-; GFX10-NEXT: v_lshrrev_b32_e32 v55, 16, v13
-; GFX10-NEXT: v_lshrrev_b32_e32 v64, 16, v14
-; GFX10-NEXT: v_lshrrev_b32_e32 v65, 16, v15
-; GFX10-NEXT: v_lshrrev_b32_e32 v66, 16, v16
-; GFX10-NEXT: v_lshrrev_b32_e32 v67, 16, v17
-; GFX10-NEXT: v_lshrrev_b32_e32 v68, 16, v18
-; GFX10-NEXT: v_mov_b32_sdwa v1, v35 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v20
-; GFX10-NEXT: v_mov_b32_sdwa v2, v36 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v21
-; GFX10-NEXT: v_mov_b32_sdwa v3, v37 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v22
-; GFX10-NEXT: v_mov_b32_sdwa v4, v38 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v23
-; GFX10-NEXT: v_mov_b32_sdwa v5, v39 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v24
-; GFX10-NEXT: v_mov_b32_sdwa v6, v48 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v25
-; GFX10-NEXT: v_mov_b32_sdwa v7, v49 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v26
-; GFX10-NEXT: v_mov_b32_sdwa v8, v50 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v27
-; GFX10-NEXT: v_mov_b32_sdwa v9, v51 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v28
-; GFX10-NEXT: v_mov_b32_sdwa v10, v52 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v29
-; GFX10-NEXT: v_mov_b32_sdwa v11, v53 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v30
-; GFX10-NEXT: v_mov_b32_sdwa v19, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v12, v54 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v13, v55 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v14, v64 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v15, v65 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v16, v66 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v17, v67 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v18, v68 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v20, v35 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v21, v36 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v22, v37 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v23, v38 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v24, v39 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v25, v48 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v26, v49 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v27, v50 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v28, v51 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v29, v52 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v30, v53 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dwordx4 v[32:33], v[0:3], off
-; GFX10-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16
-; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v31
-; GFX10-NEXT: v_mov_b32_sdwa v31, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32
-; GFX10-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48
-; GFX10-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64
-; GFX10-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80
-; GFX10-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96
-; GFX10-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- store <64 x bfloat> %val, ptr addrspace(1) %ptr
- ret void
-}
-
-define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) {
-; GCN-LABEL: test_store_fpimm:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v4, 0x3f80
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: v_mov_b32_e32 v5, 0x4228
-; GCN-NEXT: buffer_store_short v4, v[0:1], s[4:7], 0 addr64
-; GCN-NEXT: buffer_store_short v5, v[2:3], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_store_fpimm:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, 0x3f80
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_store_short v4, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_mov_b32_e32 v0, 0x4228
-; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_store_fpimm:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, 0x3f80
-; GFX8-NEXT: flat_store_short v[0:1], v4
-; GFX8-NEXT: v_mov_b32_e32 v0, 0x4228
-; GFX8-NEXT: flat_store_short v[2:3], v0
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_store_fpimm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x3f80
-; GFX9-NEXT: global_store_short v[0:1], v4, off
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x4228
-; GFX9-NEXT: global_store_short v[2:3], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_store_fpimm:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, 0x3f80
-; GFX10-NEXT: v_mov_b32_e32 v5, 0x4228
-; GFX10-NEXT: global_store_short v[0:1], v4, off
-; GFX10-NEXT: global_store_short v[2:3], v5, off
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- store bfloat 1.0, ptr addrspace(1) %ptr0
- store bfloat 42.0, ptr addrspace(1) %ptr1
- ret void
-}
-
-; FIXME: unable to translate instruction: fptrunc
-; define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-; %val = load float, ptr addrspace(1) %in
-; %val.bf16 = fptrunc float %val to bfloat
-; store bfloat %val.bf16, ptr addrspace(1) %out
-; ret void
-; }
-
-; FIXME: unable to translate instruction: fptrunc
-; define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-; %val = load double, ptr addrspace(1) %in
-; %val.bf16 = fptrunc double %val to bfloat
-; store bfloat %val.bf16, ptr addrspace(1) %out
-; ret void
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-; %val = load bfloat, ptr addrspace(1) %in
-; %val.f32 = fpext bfloat %val to float
-; store float %val.f32, ptr addrspace(1) %out
-; ret void
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-; %val = load bfloat, ptr addrspace(1) %in
-; %val.f64 = fpext bfloat %val to double
-; store double %val.f64, ptr addrspace(1) %out
-; ret void
-; }
-
-define void @test_load_store_v2bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_load_store_v2bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_load_store_v2bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_load_store_v2bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: flat_store_dword v[2:3], v0
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_load_store_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v[2:3], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_load_store_v2bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v[2:3], v0, off
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %val = load <2 x bfloat>, ptr addrspace(1) %in
- store <2 x bfloat> %val, ptr addrspace(1) %out
- ret void
-}
-
-define void @test_load_store_v4bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_load_store_v4bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_load_store_v4bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_load_store_v4bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_load_store_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_load_store_v4bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %val = load <4 x bfloat>, ptr addrspace(1) %in
- store <4 x bfloat> %val, ptr addrspace(1) %out
- ret void
-}
-
-define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_load_store_v8bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_load_store_v8bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_load_store_v8bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_load_store_v8bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx4 v[2:3], v[4:7], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_load_store_v8bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dwordx4 v[2:3], v[4:7], off
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %val = load <8 x bfloat>, ptr addrspace(1) %in
- store <8 x bfloat> %val, ptr addrspace(1) %out
- ret void
-}
-
-define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_load_store_v16bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_load_store_v16bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_load_store_v16bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
-; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_load_store_v16bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
-; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: global_store_dwordx4 v[2:3], v[4:7], off
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: global_store_dwordx4 v[2:3], v[8:11], off offset:16
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_load_store_v16bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
-; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16
-; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: global_store_dwordx4 v[2:3], v[4:7], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dwordx4 v[2:3], v[8:11], off offset:16
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %val = load <16 x bfloat>, ptr addrspace(1) %in
- store <16 x bfloat> %val, ptr addrspace(1) %out
- ret void
-}
-
-define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_arg_store:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_arg_store:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_arg_store:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_store_short v[1:2], v0
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_arg_store:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_store_short v[1:2], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_arg_store:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_store_short v[1:2], v0, off
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- store bfloat %in, ptr addrspace(1) %out
- ret void
-}
-
-define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_arg_store_v2bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: v_or_b32_e32 v0, v1, v0
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_arg_store_v2bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_arg_store_v2bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_store_dword v[1:2], v0
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_arg_store_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v[1:2], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_arg_store_v2bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_store_dword v[1:2], v0, off
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- store <2 x bfloat> %in, ptr addrspace(1) %out
- ret void
-}
-
-define void @test_arg_store_v3bf16(<3 x bfloat> %in, <3 x bfloat> addrspace(1)* %out) {
-; GCN-LABEL: test_arg_store_v3bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: buffer_store_short v0, v[3:4], s[4:7], 0 addr64
-; GCN-NEXT: buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:2
-; GCN-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_arg_store_v3bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_store_short v0, v[3:4], s[4:7], 0 addr64
-; GFX7-NEXT: buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:2
-; GFX7-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_arg_store_v3bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 2, v2
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_store_short v[2:3], v0
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_store_short v[4:5], v6
-; GFX8-NEXT: flat_store_short v[2:3], v1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_arg_store_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_store_short v[2:3], v0, off
-; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off offset:2
-; GFX9-NEXT: global_store_short v[2:3], v1, off offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_arg_store_v3bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_store_short v[2:3], v0, off
-; GFX10-NEXT: global_store_short_d16_hi v[2:3], v0, off offset:2
-; GFX10-NEXT: global_store_short v[2:3], v1, off offset:4
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- store <3 x bfloat> %in, <3 x bfloat> addrspace(1) * %out
- ret void
-}
-
-define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_arg_store_v4bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: v_or_b32_e32 v0, v1, v0
-; GCN-NEXT: v_or_b32_e32 v1, v3, v2
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_arg_store_v4bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_arg_store_v4bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX8-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_arg_store_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX9-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_arg_store_v4bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX10-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- store <4 x bfloat> %in, ptr addrspace(1) %out
- ret void
-}
-
-define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_arg_store_v8bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: v_or_b32_e32 v0, v1, v0
-; GCN-NEXT: v_or_b32_e32 v1, v3, v2
-; GCN-NEXT: v_or_b32_e32 v2, v5, v4
-; GCN-NEXT: v_or_b32_e32 v3, v7, v6
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_arg_store_v8bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_arg_store_v8bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v3
-; GFX8-NEXT: v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_arg_store_v8bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v3
-; GFX9-NEXT: v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_arg_store_v8bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v3
-; GFX10-NEXT: v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- store <8 x bfloat> %in, ptr addrspace(1) %out
- ret void
-}
-
-define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_arg_store_v16bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: v_or_b32_e32 v0, v1, v0
-; GCN-NEXT: v_or_b32_e32 v1, v3, v2
-; GCN-NEXT: v_or_b32_e32 v2, v5, v4
-; GCN-NEXT: v_or_b32_e32 v3, v7, v6
-; GCN-NEXT: v_or_b32_e32 v4, v9, v8
-; GCN-NEXT: v_or_b32_e32 v5, v11, v10
-; GCN-NEXT: v_or_b32_e32 v6, v13, v12
-; GCN-NEXT: v_or_b32_e32 v7, v15, v14
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
-; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_arg_store_v16bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v9
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v8
-; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v11
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v10
-; GFX7-NEXT: v_or_b32_e32 v5, v5, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v13
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v12
-; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v15
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v14
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_or_b32_e32 v7, v7, v8
-; GFX7-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
-; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_arg_store_v16bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v3
-; GFX8-NEXT: v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v6
-; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v7
-; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
-; GFX8-NEXT: v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v8
-; GFX8-NEXT: v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc
-; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_arg_store_v16bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v5
-; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v6
-; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v7
-; GFX9-NEXT: v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_arg_store_v16bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v5
-; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v6
-; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v7
-; GFX10-NEXT: v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
-; GFX10-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- store <16 x bfloat> %in, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_inreg_arg_store:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: s_mov_b32 s38, 0
-; GCN-NEXT: s_mov_b32 s39, 0xf000
-; GCN-NEXT: s_mov_b64 s[36:37], 0
-; GCN-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_inreg_arg_store:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: s_mov_b32 s38, 0
-; GFX7-NEXT: s_mov_b32 s39, 0xf000
-; GFX7-NEXT: s_mov_b64 s[36:37], 0
-; GFX7-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_inreg_arg_store:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: flat_store_short v[0:1], v2
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_inreg_arg_store:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: global_store_short v[0:1], v2, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_inreg_arg_store:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-NEXT: global_store_short v[0:1], v2, off
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- store bfloat %in, ptr addrspace(1) %out
- ret void
-}
-
-define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) {
-; GCN-LABEL: test_byval:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_store_short v0, off, s[0:3], s32
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_byval:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_store_short v0, off, s[0:3], s32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_byval:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_store_short v0, off, s[0:3], s32
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_byval:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_store_short v0, off, s[0:3], s32
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_byval:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_store_short v0, off, s[0:3], s32
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- store bfloat %val, ptr addrspace(5) %bv
- %retval = load bfloat, ptr addrspace(5) %bv
- ret bfloat %retval
-}
-
-define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) {
-; GCN-LABEL: test_sret:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_sret:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_sret:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_sret:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_sret:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- store bfloat %val, ptr addrspace(5) %sret
- ret void
-}
-
-define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
-; GCN-LABEL: test_bitcast_from_bfloat:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_bitcast_from_bfloat:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_bitcast_from_bfloat:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: flat_store_short v[2:3], v0
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_bitcast_from_bfloat:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v0, v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_short v[2:3], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_bitcast_from_bfloat:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v0, v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_short v[2:3], v0, off
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %val = load bfloat, ptr addrspace(1) %in
- %val_int = bitcast bfloat %val to i16
- store i16 %val_int, ptr addrspace(1) %out
- ret void
-}
-
-define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in) {
-; GCN-LABEL: test_bitcast_to_bfloat:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_bitcast_to_bfloat:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_bitcast_to_bfloat:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_ushort v2, v[2:3]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: flat_store_short v[0:1], v2
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_bitcast_to_bfloat:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v2, v[2:3], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_short v[0:1], v2, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_bitcast_to_bfloat:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v2, v[2:3], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_short v[0:1], v2, off
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %val = load i16, ptr addrspace(1) %in
- %val_fp = bitcast i16 %val to bfloat
- store bfloat %val_fp, ptr addrspace(1) %out
- ret void
-}
-
-define bfloat @test_ret(bfloat %in) {
-; GCN-LABEL: test_ret:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_ret:
-; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_ret:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_ret:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_ret:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-entry:
- ret bfloat %in
-}
-
-define <2 x bfloat> @test_ret_v2bf16(<2 x bfloat> %in) {
-; GCN-LABEL: test_ret_v2bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_ret_v2bf16:
-; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_ret_v2bf16:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_ret_v2bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_ret_v2bf16:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-entry:
- ret <2 x bfloat> %in
-}
-
-define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) {
-; GCN-LABEL: test_ret_v3bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_ret_v3bf16:
-; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_ret_v3bf16:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_ret_v3bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_ret_v3bf16:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-entry:
- ret <3 x bfloat> %in
-}
-
-define <4 x bfloat> @test_ret_v4bf16(<4 x bfloat> %in) {
-; GCN-LABEL: test_ret_v4bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_ret_v4bf16:
-; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_ret_v4bf16:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_ret_v4bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_ret_v4bf16:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-entry:
- ret <4 x bfloat> %in
-}
-
-define <8 x bfloat> @test_ret_v8bf16(<8 x bfloat> %in) {
-; GCN-LABEL: test_ret_v8bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_ret_v8bf16:
-; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_ret_v8bf16:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_ret_v8bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_ret_v8bf16:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-entry:
- ret <8 x bfloat> %in
-}
-
-define <16 x bfloat> @test_ret_v16bf16(<16 x bfloat> %in) {
-; GCN-LABEL: test_ret_v16bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_ret_v16bf16:
-; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_ret_v16bf16:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: v_mov_b32_e32 v8, v2
-; GFX8-NEXT: v_mov_b32_e32 v6, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v8
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX8-NEXT: v_mov_b32_e32 v2, v4
-; GFX8-NEXT: v_mov_b32_e32 v4, v8
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_ret_v16bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-NEXT: v_mov_b32_e32 v8, v2
-; GFX9-NEXT: v_mov_b32_e32 v6, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX9-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-NEXT: v_mov_b32_e32 v4, v8
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_ret_v16bf16:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v6, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX10-NEXT: v_mov_b32_e32 v1, v8
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-entry:
- ret <16 x bfloat> %in
-}
-
-define void @test_call(bfloat %in, ptr addrspace(5) %out) {
-; GCN-LABEL: test_call:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s8, s33
-; GCN-NEXT: s_mov_b32 s33, s32
-; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: s_addk_i32 s32, 0x400
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_writelane_b32 v2, s30, 0
-; GCN-NEXT: v_writelane_b32 v2, s31, 1
-; GCN-NEXT: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12
-; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readlane_b32 s31, v2, 1
-; GCN-NEXT: v_readlane_b32 s30, v2, 0
-; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: s_mov_b32 s33, s8
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_call:
-; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s8, s33
-; GFX7-NEXT: s_mov_b32 s33, s32
-; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: s_addk_i32 s32, 0x400
-; GFX7-NEXT: s_getpc_b64 s[4:5]
-; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4
-; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX7-NEXT: v_writelane_b32 v2, s30, 0
-; GFX7-NEXT: v_writelane_b32 v2, s31, 1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX7-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_readlane_b32 s31, v2, 1
-; GFX7-NEXT: v_readlane_b32 s30, v2, 0
-; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: s_addk_i32 s32, 0xfc00
-; GFX7-NEXT: s_mov_b32 s33, s8
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_call:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s6, s33
-; GFX8-NEXT: s_mov_b32 s33, s32
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: s_addk_i32 s32, 0x400
-; GFX8-NEXT: s_getpc_b64 s[4:5]
-; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4
-; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX8-NEXT: v_writelane_b32 v2, s30, 0
-; GFX8-NEXT: v_writelane_b32 v2, s31, 1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readlane_b32 s31, v2, 1
-; GFX8-NEXT: v_readlane_b32 s30, v2, 0
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: s_addk_i32 s32, 0xfc00
-; GFX8-NEXT: s_mov_b32 s33, s6
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_call:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s6, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT: v_writelane_b32 v2, s30, 0
-; GFX9-NEXT: v_writelane_b32 v2, s31, 1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readlane_b32 s31, v2, 1
-; GFX9-NEXT: v_readlane_b32 s30, v2, 0
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: s_mov_b32 s33, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_call:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s6, s33
-; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: s_getpc_b64 s[4:5]
-; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4
-; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v2, s30, 0
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT: v_writelane_b32 v2, s31, 1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_readlane_b32 s31, v2, 1
-; GFX10-NEXT: v_readlane_b32 s30, v2, 0
-; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: s_mov_b32 s33, s6
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-entry:
- %result = call bfloat @test_arg_store(bfloat %in)
- store volatile bfloat %result, ptr addrspace(5) %out
- ret void
-}
-
-define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
-; GCN-LABEL: test_call_v2bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s8, s33
-; GCN-NEXT: s_mov_b32 s33, s32
-; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: s_addk_i32 s32, 0x400
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_writelane_b32 v3, s30, 0
-; GCN-NEXT: v_writelane_b32 v3, s31, 1
-; GCN-NEXT: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT: v_or_b32_e32 v0, v1, v0
-; GCN-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readlane_b32 s31, v3, 1
-; GCN-NEXT: v_readlane_b32 s30, v3, 0
-; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: s_mov_b32 s33, s8
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_call_v2bf16:
-; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s8, s33
-; GFX7-NEXT: s_mov_b32 s33, s32
-; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: s_addk_i32 s32, 0x400
-; GFX7-NEXT: s_getpc_b64 s[4:5]
-; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX7-NEXT: v_writelane_b32 v3, s30, 0
-; GFX7-NEXT: v_writelane_b32 v3, s31, 1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_readlane_b32 s31, v3, 1
-; GFX7-NEXT: v_readlane_b32 s30, v3, 0
-; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: s_addk_i32 s32, 0xfc00
-; GFX7-NEXT: s_mov_b32 s33, s8
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_call_v2bf16:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s6, s33
-; GFX8-NEXT: s_mov_b32 s33, s32
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: s_addk_i32 s32, 0x400
-; GFX8-NEXT: s_getpc_b64 s[4:5]
-; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX8-NEXT: v_writelane_b32 v2, s30, 0
-; GFX8-NEXT: v_writelane_b32 v2, s31, 1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readlane_b32 s31, v2, 1
-; GFX8-NEXT: v_readlane_b32 s30, v2, 0
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: s_addk_i32 s32, 0xfc00
-; GFX8-NEXT: s_mov_b32 s33, s6
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_call_v2bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s6, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT: v_writelane_b32 v2, s30, 0
-; GFX9-NEXT: v_writelane_b32 v2, s31, 1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readlane_b32 s31, v2, 1
-; GFX9-NEXT: v_readlane_b32 s30, v2, 0
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: s_mov_b32 s33, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_call_v2bf16:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s6, s33
-; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: s_getpc_b64 s[4:5]
-; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v2, s30, 0
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT: v_writelane_b32 v2, s31, 1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_readlane_b32 s31, v2, 1
-; GFX10-NEXT: v_readlane_b32 s30, v2, 0
-; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: s_mov_b32 s33, s6
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-entry:
- %result = call <2 x bfloat> @test_arg_store_v2bf16(<2 x bfloat> %in)
- store volatile <2 x bfloat> %result, ptr addrspace(5) %out
- ret void
-}
-
-define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
-; GCN-LABEL: test_call_v3bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s8, s33
-; GCN-NEXT: s_mov_b32 s33, s32
-; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: s_addk_i32 s32, 0x400
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_writelane_b32 v4, s30, 0
-; GCN-NEXT: v_writelane_b32 v4, s31, 1
-; GCN-NEXT: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT: v_add_i32_e32 v5, vcc, 4, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_or_b32_e32 v0, v0, v1
-; GCN-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v2, v5, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readlane_b32 s31, v4, 1
-; GCN-NEXT: v_readlane_b32 s30, v4, 0
-; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: s_mov_b32 s33, s8
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_call_v3bf16:
-; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s8, s33
-; GFX7-NEXT: s_mov_b32 s33, s32
-; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: s_addk_i32 s32, 0x400
-; GFX7-NEXT: s_getpc_b64 s[4:5]
-; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX7-NEXT: v_writelane_b32 v4, s30, 0
-; GFX7-NEXT: v_writelane_b32 v4, s31, 1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v3
-; GFX7-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_readlane_b32 s31, v4, 1
-; GFX7-NEXT: v_readlane_b32 s30, v4, 0
-; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: s_addk_i32 s32, 0xfc00
-; GFX7-NEXT: s_mov_b32 s33, s8
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_call_v3bf16:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s6, s33
-; GFX8-NEXT: s_mov_b32 s33, s32
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: s_addk_i32 s32, 0x400
-; GFX8-NEXT: s_getpc_b64 s[4:5]
-; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX8-NEXT: v_writelane_b32 v3, s30, 0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT: v_writelane_b32 v3, s31, 1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2
-; GFX8-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readlane_b32 s31, v3, 1
-; GFX8-NEXT: v_readlane_b32 s30, v3, 0
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: s_addk_i32 s32, 0xfc00
-; GFX8-NEXT: s_mov_b32 s33, s6
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_call_v3bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s6, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT: v_writelane_b32 v3, s30, 0
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_writelane_b32 v3, s31, 1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_mov_b32 s4, 0xffff
-; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v0
-; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readlane_b32 s31, v3, 1
-; GFX9-NEXT: v_readlane_b32 s30, v3, 0
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: s_mov_b32 s33, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_call_v3bf16:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s6, s33
-; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: s_getpc_b64 s[4:5]
-; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v3, s30, 0
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_writelane_b32 v3, s31, 1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v0, v0
-; GFX10-NEXT: v_readlane_b32 s31, v3, 1
-; GFX10-NEXT: v_readlane_b32 s30, v3, 0
-; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: s_mov_b32 s33, s6
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-entry:
- %result = call <3 x bfloat> @test_arg_store_v2bf16(<3 x bfloat> %in)
- store volatile <3 x bfloat> %result, ptr addrspace(5) %out
- ret void
-}
-
-define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
-; GCN-LABEL: test_call_v4bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s8, s33
-; GCN-NEXT: s_mov_b32 s33, s32
-; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: s_addk_i32 s32, 0x400
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_writelane_b32 v5, s30, 0
-; GCN-NEXT: v_writelane_b32 v5, s31, 1
-; GCN-NEXT: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT: v_add_i32_e32 v6, vcc, 4, v4
-; GCN-NEXT: v_or_b32_e32 v0, v1, v0
-; GCN-NEXT: v_or_b32_e32 v1, v3, v2
-; GCN-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readlane_b32 s31, v5, 1
-; GCN-NEXT: v_readlane_b32 s30, v5, 0
-; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: s_mov_b32 s33, s8
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_call_v4bf16:
-; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s8, s33
-; GFX7-NEXT: s_mov_b32 s33, s32
-; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: s_addk_i32 s32, 0x400
-; GFX7-NEXT: s_getpc_b64 s[4:5]
-; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX7-NEXT: v_writelane_b32 v5, s30, 0
-; GFX7-NEXT: v_writelane_b32 v5, s31, 1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v4
-; GFX7-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_readlane_b32 s31, v5, 1
-; GFX7-NEXT: v_readlane_b32 s30, v5, 0
-; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: s_addk_i32 s32, 0xfc00
-; GFX7-NEXT: s_mov_b32 s33, s8
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_call_v4bf16:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s6, s33
-; GFX8-NEXT: s_mov_b32 s33, s32
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: s_addk_i32 s32, 0x400
-; GFX8-NEXT: s_getpc_b64 s[4:5]
-; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX8-NEXT: v_writelane_b32 v3, s30, 0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT: v_writelane_b32 v3, s31, 1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX8-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2
-; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readlane_b32 s31, v3, 1
-; GFX8-NEXT: v_readlane_b32 s30, v3, 0
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: s_addk_i32 s32, 0xfc00
-; GFX8-NEXT: s_mov_b32 s33, s6
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_call_v4bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s6, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT: v_writelane_b32 v3, s30, 0
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_writelane_b32 v3, s31, 1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX9-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readlane_b32 s31, v3, 1
-; GFX9-NEXT: v_readlane_b32 s30, v3, 0
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: s_mov_b32 s33, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_call_v4bf16:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s6, s33
-; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: s_getpc_b64 s[4:5]
-; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v3, s30, 0
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_writelane_b32 v3, s31, 1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX10-NEXT: v_readlane_b32 s31, v3, 1
-; GFX10-NEXT: v_readlane_b32 s30, v3, 0
-; GFX10-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: s_mov_b32 s33, s6
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-entry:
- %result = call <4 x bfloat> @test_arg_store_v2bf16(<4 x bfloat> %in)
- store volatile <4 x bfloat> %result, ptr addrspace(5) %out
- ret void
-}
-
-define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
-; GCN-LABEL: test_call_v8bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s8, s33
-; GCN-NEXT: s_mov_b32 s33, s32
-; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: s_addk_i32 s32, 0x400
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_writelane_b32 v9, s30, 0
-; GCN-NEXT: v_writelane_b32 v9, s31, 1
-; GCN-NEXT: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GCN-NEXT: v_add_i32_e32 v10, vcc, 4, v8
-; GCN-NEXT: v_add_i32_e32 v11, vcc, 8, v8
-; GCN-NEXT: v_add_i32_e32 v12, vcc, 12, v8
-; GCN-NEXT: v_or_b32_e32 v0, v1, v0
-; GCN-NEXT: v_or_b32_e32 v1, v3, v2
-; GCN-NEXT: v_or_b32_e32 v2, v5, v4
-; GCN-NEXT: v_or_b32_e32 v3, v7, v6
-; GCN-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v1, v10, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v2, v11, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v3, v12, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readlane_b32 s31, v9, 1
-; GCN-NEXT: v_readlane_b32 s30, v9, 0
-; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: s_mov_b32 s33, s8
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_call_v8bf16:
-; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s8, s33
-; GFX7-NEXT: s_mov_b32 s33, s32
-; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: s_addk_i32 s32, 0x400
-; GFX7-NEXT: s_getpc_b64 s[4:5]
-; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX7-NEXT: v_writelane_b32 v9, s30, 0
-; GFX7-NEXT: v_writelane_b32 v9, s31, 1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX7-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v8
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX7-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v8
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 12, v8
-; GFX7-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_readlane_b32 s31, v9, 1
-; GFX7-NEXT: v_readlane_b32 s30, v9, 0
-; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: s_addk_i32 s32, 0xfc00
-; GFX7-NEXT: s_mov_b32 s33, s8
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_call_v8bf16:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s6, s33
-; GFX8-NEXT: s_mov_b32 s33, s32
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: s_addk_i32 s32, 0x400
-; GFX8-NEXT: s_getpc_b64 s[4:5]
-; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v2, v1
-; GFX8-NEXT: v_writelane_b32 v5, s30, 0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_writelane_b32 v5, s31, 1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; GFX8-NEXT: v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2
-; GFX8-NEXT: v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v3
-; GFX8-NEXT: v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v4
-; GFX8-NEXT: v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 12, v4
-; GFX8-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readlane_b32 s31, v5, 1
-; GFX8-NEXT: v_readlane_b32 s30, v5, 0
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: s_addk_i32 s32, 0xfc00
-; GFX8-NEXT: s_mov_b32 s33, s6
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_call_v8bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s6, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-NEXT: v_writelane_b32 v5, s30, 0
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_writelane_b32 v5, s31, 1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v3
-; GFX9-NEXT: v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readlane_b32 s31, v5, 1
-; GFX9-NEXT: v_readlane_b32 s30, v5, 0
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: s_mov_b32 s33, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_call_v8bf16:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s6, s33
-; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: s_getpc_b64 s[4:5]
-; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT: v_writelane_b32 v5, s30, 0
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_writelane_b32 v5, s31, 1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v3
-; GFX10-NEXT: v_readlane_b32 s31, v5, 1
-; GFX10-NEXT: v_mov_b32_sdwa v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_readlane_b32 s30, v5, 0
-; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: s_mov_b32 s33, s6
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-entry:
- %result = call <8 x bfloat> @test_arg_store_v2bf16(<8 x bfloat> %in)
- store volatile <8 x bfloat> %result, ptr addrspace(5) %out
- ret void
-}
-
-define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
-; GCN-LABEL: test_call_v16bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s8, s33
-; GCN-NEXT: s_mov_b32 s33, s32
-; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: s_addk_i32 s32, 0x400
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_writelane_b32 v17, s30, 0
-; GCN-NEXT: v_writelane_b32 v17, s31, 1
-; GCN-NEXT: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GCN-NEXT: v_add_i32_e32 v18, vcc, 4, v16
-; GCN-NEXT: v_add_i32_e32 v19, vcc, 8, v16
-; GCN-NEXT: v_add_i32_e32 v20, vcc, 12, v16
-; GCN-NEXT: v_add_i32_e32 v21, vcc, 16, v16
-; GCN-NEXT: v_add_i32_e32 v22, vcc, 20, v16
-; GCN-NEXT: v_add_i32_e32 v23, vcc, 24, v16
-; GCN-NEXT: v_add_i32_e32 v24, vcc, 28, v16
-; GCN-NEXT: v_or_b32_e32 v0, v1, v0
-; GCN-NEXT: v_or_b32_e32 v1, v3, v2
-; GCN-NEXT: v_or_b32_e32 v2, v5, v4
-; GCN-NEXT: v_or_b32_e32 v3, v7, v6
-; GCN-NEXT: v_or_b32_e32 v4, v9, v8
-; GCN-NEXT: v_or_b32_e32 v5, v11, v10
-; GCN-NEXT: v_or_b32_e32 v6, v13, v12
-; GCN-NEXT: v_or_b32_e32 v7, v15, v14
-; GCN-NEXT: buffer_store_dword v0, v16, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v1, v18, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v3, v20, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v4, v21, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v5, v22, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v6, v23, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v7, v24, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readlane_b32 s31, v17, 1
-; GCN-NEXT: v_readlane_b32 s30, v17, 0
-; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: s_mov_b32 s33, s8
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_call_v16bf16:
-; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s8, s33
-; GFX7-NEXT: s_mov_b32 s33, s32
-; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT: buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: s_addk_i32 s32, 0x400
-; GFX7-NEXT: s_getpc_b64 s[4:5]
-; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX7-NEXT: v_writelane_b32 v17, s30, 0
-; GFX7-NEXT: v_writelane_b32 v17, s31, 1
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX7-NEXT: buffer_store_dword v0, v16, s[0:3], 0 offen
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v16
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX7-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v16
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v9
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v8
-; GFX7-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 12, v16
-; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v11
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v10
-; GFX7-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 16, v16
-; GFX7-NEXT: v_or_b32_e32 v5, v5, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v13
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v12
-; GFX7-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 20, v16
-; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v15
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v14
-; GFX7-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 24, v16
-; GFX7-NEXT: v_or_b32_e32 v7, v7, v8
-; GFX7-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 28, v16
-; GFX7-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_readlane_b32 s31, v17, 1
-; GFX7-NEXT: v_readlane_b32 s30, v17, 0
-; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: s_addk_i32 s32, 0xfc00
-; GFX7-NEXT: s_mov_b32 s33, s8
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_call_v16bf16:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s6, s33
-; GFX8-NEXT: s_mov_b32 s33, s32
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: s_addk_i32 s32, 0x400
-; GFX8-NEXT: s_getpc_b64 s[4:5]
-; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: v_mov_b32_e32 v10, v2
-; GFX8-NEXT: v_mov_b32_e32 v6, v3
-; GFX8-NEXT: v_writelane_b32 v9, s30, 0
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v10
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX8-NEXT: v_mov_b32_e32 v2, v4
-; GFX8-NEXT: v_mov_b32_e32 v4, v10
-; GFX8-NEXT: v_writelane_b32 v9, s31, 1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v1
-; GFX8-NEXT: v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v2
-; GFX8-NEXT: v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v8
-; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v3
-; GFX8-NEXT: v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v8
-; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v4
-; GFX8-NEXT: v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 12, v8
-; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v5
-; GFX8-NEXT: v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v8
-; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v6
-; GFX8-NEXT: v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 20, v8
-; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v7
-; GFX8-NEXT: v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 24, v8
-; GFX8-NEXT: v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX8-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 28, v8
-; GFX8-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readlane_b32 s31, v9, 1
-; GFX8-NEXT: v_readlane_b32 s30, v9, 0
-; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: s_addk_i32 s32, 0xfc00
-; GFX8-NEXT: s_mov_b32 s33, s6
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_call_v16bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s6, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-NEXT: v_mov_b32_e32 v10, v2
-; GFX9-NEXT: v_mov_b32_e32 v6, v3
-; GFX9-NEXT: v_writelane_b32 v9, s30, 0
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX9-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-NEXT: v_mov_b32_e32 v4, v10
-; GFX9-NEXT: v_writelane_b32 v9, s31, 1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v5
-; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v6
-; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v7
-; GFX9-NEXT: v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen offset:8
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen offset:12
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen offset:16
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen offset:20
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readlane_b32 s31, v9, 1
-; GFX9-NEXT: v_readlane_b32 s30, v9, 0
-; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: s_mov_b32 s33, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_call_v16bf16:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s6, s33
-; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: s_getpc_b64 s[4:5]
-; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16 at gotpcrel32@lo+4
-; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
-; GFX10-NEXT: v_mov_b32_e32 v4, v1
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v10, v2
-; GFX10-NEXT: v_mov_b32_e32 v6, v3
-; GFX10-NEXT: v_writelane_b32 v9, s30, 0
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v10
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX10-NEXT: v_mov_b32_e32 v2, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v10
-; GFX10-NEXT: v_writelane_b32 v9, s31, 1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v5
-; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v6
-; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v7
-; GFX10-NEXT: v_mov_b32_sdwa v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen offset:4
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen offset:8
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen offset:12
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen offset:16
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen offset:20
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_readlane_b32 s31, v9, 1
-; GFX10-NEXT: v_readlane_b32 s30, v9, 0
-; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: s_mov_b32 s33, s6
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-entry:
- %result = call <16 x bfloat> @test_arg_store_v2bf16(<16 x bfloat> %in)
- store volatile <16 x bfloat> %result, ptr addrspace(5) %out
- ret void
-}
-
-define bfloat @test_alloca_load_store_ret(bfloat %in) {
-; GCN-LABEL: test_alloca_load_store_ret:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_store_short v0, off, s[0:3], s32
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_alloca_load_store_ret:
-; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_store_short v0, off, s[0:3], s32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_alloca_load_store_ret:
-; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_store_short v0, off, s[0:3], s32
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_alloca_load_store_ret:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_store_short v0, off, s[0:3], s32
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_alloca_load_store_ret:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_store_short v0, off, s[0:3], s32
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc dlc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-entry:
- %in.addr = alloca bfloat, align 2, addrspace(5)
- store volatile bfloat %in, ptr addrspace(5) %in.addr, align 2
- %loaded = load volatile bfloat, ptr addrspace(5) %in.addr, align 2
- ret bfloat %loaded
-}
-
-define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
-; GCN-LABEL: test_overflow_stack:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 4, v0
-; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0
-; GCN-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0
-; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 16, v0
-; GCN-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 20, v0
-; GCN-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 24, v0
-; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 28, v0
-; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 32, v0
-; GCN-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 36, v0
-; GCN-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 40, v0
-; GCN-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 44, v0
-; GCN-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen
-; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32
-; GCN-NEXT: v_add_i32_e32 v3, vcc, 48, v0
-; GCN-NEXT: buffer_store_dword v14, v3, s[0:3], 0 offen
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4
-; GCN-NEXT: v_add_i32_e32 v4, vcc, 52, v0
-; GCN-NEXT: buffer_store_dword v15, v4, s[0:3], 0 offen
-; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8
-; GCN-NEXT: v_add_i32_e32 v5, vcc, 56, v0
-; GCN-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v5, vcc, 60, v0
-; GCN-NEXT: v_add_i32_e32 v6, vcc, 64, v0
-; GCN-NEXT: buffer_store_dword v17, v5, s[0:3], 0 offen
-; GCN-NEXT: v_mov_b32_e32 v5, 0x44
-; GCN-NEXT: v_mov_b32_e32 v7, 0x48
-; GCN-NEXT: buffer_store_dword v18, v6, s[0:3], 0 offen
-; GCN-NEXT: v_mov_b32_e32 v6, 0x4c
-; GCN-NEXT: v_mov_b32_e32 v8, 0x50
-; GCN-NEXT: v_add_i32_e32 v5, vcc, v0, v5
-; GCN-NEXT: buffer_store_dword v19, v5, s[0:3], 0 offen
-; GCN-NEXT: v_mov_b32_e32 v5, 0x54
-; GCN-NEXT: v_mov_b32_e32 v9, 0x58
-; GCN-NEXT: v_add_i32_e32 v7, vcc, v0, v7
-; GCN-NEXT: buffer_store_dword v20, v7, s[0:3], 0 offen
-; GCN-NEXT: v_mov_b32_e32 v7, 0x5c
-; GCN-NEXT: v_mov_b32_e32 v10, 0x60
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v0, v6
-; GCN-NEXT: buffer_store_dword v21, v6, s[0:3], 0 offen
-; GCN-NEXT: v_mov_b32_e32 v6, 0x64
-; GCN-NEXT: v_mov_b32_e32 v11, 0x68
-; GCN-NEXT: v_add_i32_e32 v8, vcc, v0, v8
-; GCN-NEXT: buffer_store_dword v22, v8, s[0:3], 0 offen
-; GCN-NEXT: v_mov_b32_e32 v8, 0x6c
-; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x70, v0
-; GCN-NEXT: v_add_i32_e32 v5, vcc, v0, v5
-; GCN-NEXT: buffer_store_dword v23, v5, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x74, v0
-; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x78, v0
-; GCN-NEXT: v_add_i32_e32 v9, vcc, v0, v9
-; GCN-NEXT: buffer_store_dword v24, v9, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x7c, v0
-; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x80, v0
-; GCN-NEXT: v_add_i32_e32 v7, vcc, v0, v7
-; GCN-NEXT: v_add_i32_e32 v10, vcc, v0, v10
-; GCN-NEXT: v_add_i32_e32 v6, vcc, v0, v6
-; GCN-NEXT: v_add_i32_e32 v11, vcc, v0, v11
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v8
-; GCN-NEXT: buffer_store_dword v25, v7, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v26, v10, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v27, v6, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v28, v11, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v30, v12, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(14)
-; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v3, v13, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_short v1, v14, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: test_overflow_stack:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0
-; GFX7-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 8, v0
-; GFX7-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 12, v0
-; GFX7-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0
-; GFX7-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 20, v0
-; GFX7-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 24, v0
-; GFX7-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 28, v0
-; GFX7-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 32, v0
-; GFX7-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 36, v0
-; GFX7-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 40, v0
-; GFX7-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 44, v0
-; GFX7-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen
-; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 48, v0
-; GFX7-NEXT: buffer_store_dword v14, v3, s[0:3], 0 offen
-; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 52, v0
-; GFX7-NEXT: buffer_store_dword v15, v4, s[0:3], 0 offen
-; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 56, v0
-; GFX7-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 60, v0
-; GFX7-NEXT: buffer_store_dword v17, v5, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 64, v0
-; GFX7-NEXT: buffer_store_dword v18, v5, s[0:3], 0 offen
-; GFX7-NEXT: v_mov_b32_e32 v5, 0x44
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, v0, v5
-; GFX7-NEXT: buffer_store_dword v19, v5, s[0:3], 0 offen
-; GFX7-NEXT: v_mov_b32_e32 v5, 0x48
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, v0, v5
-; GFX7-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen
-; GFX7-NEXT: v_mov_b32_e32 v5, 0x4c
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, v0, v5
-; GFX7-NEXT: buffer_store_dword v21, v5, s[0:3], 0 offen
-; GFX7-NEXT: v_mov_b32_e32 v5, 0x50
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, v0, v5
-; GFX7-NEXT: buffer_store_dword v22, v5, s[0:3], 0 offen
-; GFX7-NEXT: v_mov_b32_e32 v5, 0x54
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, v0, v5
-; GFX7-NEXT: buffer_store_dword v23, v5, s[0:3], 0 offen
-; GFX7-NEXT: v_mov_b32_e32 v5, 0x58
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, v0, v5
-; GFX7-NEXT: buffer_store_dword v24, v5, s[0:3], 0 offen
-; GFX7-NEXT: v_mov_b32_e32 v5, 0x5c
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, v0, v5
-; GFX7-NEXT: buffer_store_dword v25, v5, s[0:3], 0 offen
-; GFX7-NEXT: v_mov_b32_e32 v5, 0x60
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, v0, v5
-; GFX7-NEXT: buffer_store_dword v26, v5, s[0:3], 0 offen
-; GFX7-NEXT: v_mov_b32_e32 v5, 0x64
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, v0, v5
-; GFX7-NEXT: buffer_store_dword v27, v5, s[0:3], 0 offen
-; GFX7-NEXT: v_mov_b32_e32 v5, 0x68
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, v0, v5
-; GFX7-NEXT: buffer_store_dword v28, v5, s[0:3], 0 offen
-; GFX7-NEXT: v_mov_b32_e32 v5, 0x6c
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, v0, v5
-; GFX7-NEXT: buffer_store_dword v29, v5, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0
-; GFX7-NEXT: buffer_store_dword v30, v5, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x74, v0
-; GFX7-NEXT: s_waitcnt vmcnt(14)
-; GFX7-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
-; GFX7-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x7c, v0
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x80, v0
-; GFX7-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: test_overflow_stack:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
-; GFX8-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0
-; GFX8-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 12, v0
-; GFX8-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0
-; GFX8-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 20, v0
-; GFX8-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 24, v0
-; GFX8-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 28, v0
-; GFX8-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0
-; GFX8-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 36, v0
-; GFX8-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 40, v0
-; GFX8-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 44, v0
-; GFX8-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen
-; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 48, v0
-; GFX8-NEXT: buffer_store_dword v14, v3, s[0:3], 0 offen
-; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 52, v0
-; GFX8-NEXT: buffer_store_dword v15, v4, s[0:3], 0 offen
-; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 56, v0
-; GFX8-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 60, v0
-; GFX8-NEXT: buffer_store_dword v17, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 64, v0
-; GFX8-NEXT: buffer_store_dword v18, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_mov_b32_e32 v5, 0x44
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v0, v5
-; GFX8-NEXT: buffer_store_dword v19, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_mov_b32_e32 v5, 0x48
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v0, v5
-; GFX8-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_mov_b32_e32 v5, 0x4c
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v0, v5
-; GFX8-NEXT: buffer_store_dword v21, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_mov_b32_e32 v5, 0x50
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v0, v5
-; GFX8-NEXT: buffer_store_dword v22, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_mov_b32_e32 v5, 0x54
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v0, v5
-; GFX8-NEXT: buffer_store_dword v23, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_mov_b32_e32 v5, 0x58
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v0, v5
-; GFX8-NEXT: buffer_store_dword v24, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_mov_b32_e32 v5, 0x5c
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v0, v5
-; GFX8-NEXT: buffer_store_dword v25, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_mov_b32_e32 v5, 0x60
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v0, v5
-; GFX8-NEXT: buffer_store_dword v26, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_mov_b32_e32 v5, 0x64
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v0, v5
-; GFX8-NEXT: buffer_store_dword v27, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_mov_b32_e32 v5, 0x68
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v0, v5
-; GFX8-NEXT: buffer_store_dword v28, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_mov_b32_e32 v5, 0x6c
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v0, v5
-; GFX8-NEXT: buffer_store_dword v29, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x70, v0
-; GFX8-NEXT: buffer_store_dword v30, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x74, v0
-; GFX8-NEXT: s_waitcnt vmcnt(14)
-; GFX8-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0
-; GFX8-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7c, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0
-; GFX8-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
-; GFX8-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_overflow_stack:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
-; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
-; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
-; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
-; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
-; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32
-; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36
-; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44
-; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48
-; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
-; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
-; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
-; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
-; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
-; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
-; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
-; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
-; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
-; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
-; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
-; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
-; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
-; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108
-; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112
-; GFX9-NEXT: s_waitcnt vmcnt(20)
-; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:116
-; GFX9-NEXT: s_waitcnt vmcnt(20)
-; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:120
-; GFX9-NEXT: s_waitcnt vmcnt(20)
-; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:124
-; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: test_overflow_stack:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_clause 0x2
-; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
-; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
-; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
-; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
-; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
-; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
-; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
-; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32
-; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36
-; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40
-; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44
-; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48
-; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
-; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
-; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
-; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
-; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
-; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
-; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
-; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
-; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
-; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
-; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
-; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
-; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
-; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
-; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108
-; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112
-; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:116
-; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:120
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:124
-; GFX10-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0
- %ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1
- ret { <32 x i32>, bfloat } %ins.1
-}
-
-; FIXME: unable to translate instruction: fpext
-; define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) {
-; %load = load <2 x bfloat>, ptr addrspace(1) %ptr
-; %fpext = fpext <2 x bfloat> %load to <2 x float>
-; ret <2 x float> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) {
-; %load = load <3 x bfloat>, ptr addrspace(1) %ptr
-; %fpext = fpext <3 x bfloat> %load to <3 x float>
-; ret <3 x float> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <4 x float> @global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) {
-; %load = load <4 x bfloat>, ptr addrspace(1) %ptr
-; %fpext = fpext <4 x bfloat> %load to <4 x float>
-; ret <4 x float> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) {
-; %load = load <5 x bfloat>, ptr addrspace(1) %ptr
-; %fpext = fpext <5 x bfloat> %load to <5 x float>
-; ret <5 x float> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) {
-; %load = load <6 x bfloat>, ptr addrspace(1) %ptr
-; %fpext = fpext <6 x bfloat> %load to <6 x float>
-; ret <6 x float> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) {
-; %load = load <8 x bfloat>, ptr addrspace(1) %ptr
-; %fpext = fpext <8 x bfloat> %load to <8 x float>
-; ret <8 x float> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) {
-; %load = load <16 x bfloat>, ptr addrspace(1) %ptr
-; %fpext = fpext <16 x bfloat> %load to <16 x float>
-; ret <16 x float> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) {
-; %load = load <32 x bfloat>, ptr addrspace(1) %ptr
-; %fpext = fpext <32 x bfloat> %load to <32 x float>
-; ret <32 x float> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) {
-; %load = load <2 x bfloat>, ptr addrspace(1) %ptr
-; %fpext = fpext <2 x bfloat> %load to <2 x double>
-; ret <2 x double> %fpext
-; }
-
-; define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) {
-; %load = load <3 x bfloat>, ptr addrspace(1) %ptr
-; %fpext = fpext <3 x bfloat> %load to <3 x double>
-; ret <3 x double> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) {
-; %load = load <4 x bfloat>, ptr addrspace(1) %ptr
-; %fpext = fpext <4 x bfloat> %load to <4 x double>
-; ret <4 x double> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) {
-; %load = load <5 x bfloat>, ptr addrspace(1) %ptr
-; %fpext = fpext <5 x bfloat> %load to <5 x double>
-; ret <5 x double> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) {
-; %load = load <6 x bfloat>, ptr addrspace(1) %ptr
-; %fpext = fpext <6 x bfloat> %load to <6 x double>
-; ret <6 x double> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) {
-; %load = load <8 x bfloat>, ptr addrspace(1) %ptr
-; %fpext = fpext <8 x bfloat> %load to <8 x double>
-; ret <8 x double> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) {
-; %load = load <16 x bfloat>, ptr addrspace(1) %ptr
-; %fpext = fpext <16 x bfloat> %load to <16 x double>
-; ret <16 x double> %fpext
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
-; %load = load <32 x bfloat>, ptr addrspace(1) %ptr
-; %fpext = fpext <32 x bfloat> %load to <32 x double>
-; ret <32 x double> %fpext
-; }
-
-define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fadd_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_add_f32_e32 v0, v0, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fadd_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fadd_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_f16_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fadd_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_add_f16_e32 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fadd_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_f16_e32 v0, v0, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fadd bfloat %a, %b
- ret bfloat %op
-}
-
-define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
-; GCN-LABEL: v_fadd_v2bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_add_f32_e32 v0, v0, v2
-; GCN-NEXT: v_add_f32_e32 v1, v1, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fadd_v2bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fadd_v2bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_f16_e32 v2, v0, v1
-; GFX8-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fadd_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_add_f16 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fadd_v2bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_add_f16 v0, v0, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fadd <2 x bfloat> %a, %b
- ret <2 x bfloat> %op
-}
-
-define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
-; GCN-LABEL: v_fadd_v3bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT: v_add_f32_e32 v0, v0, v3
-; GCN-NEXT: v_add_f32_e32 v1, v1, v4
-; GCN-NEXT: v_add_f32_e32 v2, v2, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fadd_v3bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fadd_v3bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_f16_e32 v3, v0, v2
-; GFX8-NEXT: v_add_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fadd_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, 0xffff
-; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v0
-; GFX9-NEXT: v_bfi_b32 v1, s4, v2, v2
-; GFX9-NEXT: v_pk_add_f16 v0, v0, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fadd_v3bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v0, v0
-; GFX10-NEXT: v_bfi_b32 v1, 0xffff, v2, v2
-; GFX10-NEXT: v_pk_add_f16 v0, v0, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fadd <3 x bfloat> %a, %b
- ret <3 x bfloat> %op
-}
-
-define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
-; GCN-LABEL: v_fadd_v4bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT: v_add_f32_e32 v0, v0, v4
-; GCN-NEXT: v_add_f32_e32 v1, v1, v5
-; GCN-NEXT: v_add_f32_e32 v2, v2, v6
-; GCN-NEXT: v_add_f32_e32 v3, v3, v7
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fadd_v4bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6
-; GFX7-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fadd_v4bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_f16_e32 v3, v0, v2
-; GFX8-NEXT: v_add_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fadd_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_add_f16 v0, v0, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fadd_v4bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_pk_add_f16 v0, v0, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fadd <4 x bfloat> %a, %b
- ret <4 x bfloat> %op
-}
-
-define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
-; GCN-LABEL: v_fadd_v8bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT: v_add_f32_e32 v0, v0, v8
-; GCN-NEXT: v_add_f32_e32 v1, v1, v9
-; GCN-NEXT: v_add_f32_e32 v2, v2, v10
-; GCN-NEXT: v_add_f32_e32 v3, v3, v11
-; GCN-NEXT: v_add_f32_e32 v4, v4, v12
-; GCN-NEXT: v_add_f32_e32 v5, v5, v13
-; GCN-NEXT: v_add_f32_e32 v6, v6, v14
-; GCN-NEXT: v_add_f32_e32 v7, v7, v15
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fadd_v8bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v10
-; GFX7-NEXT: v_add_f32_e32 v1, v1, v9
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v11
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v12
-; GFX7-NEXT: v_add_f32_e32 v3, v3, v9
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v13
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v14
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v15
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v8
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v9
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fadd_v8bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_f16_e32 v6, v0, v4
-; GFX8-NEXT: v_add_f16_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v2, v1, v5
-; GFX8-NEXT: v_add_f16_sdwa v3, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_mov_b32_e32 v0, v6
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fadd_v8bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v5
-; GFX9-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_add_f16 v0, v0, v4
-; GFX9-NEXT: v_pk_add_f16 v2, v1, v5
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fadd_v8bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5
-; GFX10-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_pk_add_f16 v0, v0, v4
-; GFX10-NEXT: v_pk_add_f16 v2, v1, v5
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fadd <8 x bfloat> %a, %b
- ret <8 x bfloat> %op
-}
-
-define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
-; GCN-LABEL: v_fadd_v16bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT: v_add_f32_e32 v0, v0, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v17
-; GCN-NEXT: v_add_f32_e32 v1, v1, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v18
-; GCN-NEXT: v_add_f32_e32 v2, v2, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v19
-; GCN-NEXT: v_add_f32_e32 v3, v3, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v20
-; GCN-NEXT: v_add_f32_e32 v4, v4, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v21
-; GCN-NEXT: v_add_f32_e32 v5, v5, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v22
-; GCN-NEXT: v_add_f32_e32 v6, v6, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v23
-; GCN-NEXT: v_add_f32_e32 v7, v7, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v24
-; GCN-NEXT: v_add_f32_e32 v8, v8, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v25
-; GCN-NEXT: v_add_f32_e32 v9, v9, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v26
-; GCN-NEXT: v_add_f32_e32 v10, v10, v16
-; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32
-; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT: v_cvt_f32_f16_e32 v17, v27
-; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT: v_cvt_f32_f16_e32 v18, v28
-; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT: v_cvt_f32_f16_e32 v19, v29
-; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT: v_cvt_f32_f16_e32 v20, v30
-; GCN-NEXT: v_add_f32_e32 v11, v11, v17
-; GCN-NEXT: v_add_f32_e32 v12, v12, v18
-; GCN-NEXT: v_add_f32_e32 v13, v13, v19
-; GCN-NEXT: v_add_f32_e32 v14, v14, v20
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
-; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
-; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
-; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
-; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
-; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
-; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
-; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT: v_add_f32_e32 v15, v15, v16
-; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fadd_v16bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v16
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v20
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_add_f32_e32 v1, v1, v16
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v18
-; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v21
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v16
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v19
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v22
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v18
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GFX7-NEXT: v_add_f32_e32 v3, v3, v16
-; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32
-; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v23
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v24
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v18
-; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v25
-; GFX7-NEXT: v_add_f32_e32 v8, v8, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v26
-; GFX7-NEXT: v_add_f32_e32 v9, v9, v18
-; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v27
-; GFX7-NEXT: v_add_f32_e32 v10, v10, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v28
-; GFX7-NEXT: v_add_f32_e32 v11, v11, v18
-; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13
-; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v29
-; GFX7-NEXT: v_add_f32_e32 v12, v12, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v30
-; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15
-; GFX7-NEXT: v_add_f32_e32 v13, v13, v18
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_add_f32_e32 v14, v14, v17
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8
-; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9
-; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10
-; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11
-; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12
-; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13
-; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT: v_add_f32_e32 v15, v15, v16
-; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fadd_v16bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_f16_e32 v12, v0, v8
-; GFX8-NEXT: v_add_f16_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v13, v1, v9
-; GFX8-NEXT: v_add_f16_sdwa v9, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v4, v2, v10
-; GFX8-NEXT: v_add_f16_sdwa v5, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v6, v3, v11
-; GFX8-NEXT: v_add_f16_sdwa v7, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_mov_b32_e32 v0, v12
-; GFX8-NEXT: v_mov_b32_e32 v1, v8
-; GFX8-NEXT: v_mov_b32_e32 v2, v13
-; GFX8-NEXT: v_mov_b32_e32 v3, v9
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fadd_v16bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v9
-; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v11
-; GFX9-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_add_f16 v0, v0, v8
-; GFX9-NEXT: v_pk_add_f16 v8, v1, v9
-; GFX9-NEXT: v_pk_add_f16 v4, v2, v10
-; GFX9-NEXT: v_pk_add_f16 v6, v3, v11
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX9-NEXT: v_mov_b32_e32 v2, v8
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fadd_v16bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v8
-; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v9
-; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v10
-; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v11
-; GFX10-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_pk_add_f16 v0, v0, v8
-; GFX10-NEXT: v_pk_add_f16 v8, v1, v9
-; GFX10-NEXT: v_pk_add_f16 v4, v2, v10
-; GFX10-NEXT: v_pk_add_f16 v6, v3, v11
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v8
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX10-NEXT: v_mov_b32_e32 v2, v8
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fadd <16 x bfloat> %a, %b
- ret <16 x bfloat> %op
-}
-
-define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
-; GCN-LABEL: v_fadd_v32bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: v_add_f32_e32 v0, v0, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
-; GCN-NEXT: v_add_f32_e32 v1, v1, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_add_f32_e32 v2, v2, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
-; GCN-NEXT: v_add_f32_e32 v3, v3, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_add_f32_e32 v4, v4, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
-; GCN-NEXT: v_add_f32_e32 v5, v5, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_add_f32_e32 v6, v6, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
-; GCN-NEXT: v_add_f32_e32 v7, v7, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_add_f32_e32 v8, v8, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
-; GCN-NEXT: v_add_f32_e32 v9, v9, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_add_f32_e32 v10, v10, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
-; GCN-NEXT: v_add_f32_e32 v11, v11, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_add_f32_e32 v12, v12, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
-; GCN-NEXT: v_add_f32_e32 v13, v13, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_add_f32_e32 v14, v14, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
-; GCN-NEXT: v_add_f32_e32 v15, v15, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_add_f32_e32 v16, v16, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
-; GCN-NEXT: v_add_f32_e32 v17, v17, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_add_f32_e32 v18, v18, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
-; GCN-NEXT: v_add_f32_e32 v19, v19, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_add_f32_e32 v20, v20, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
-; GCN-NEXT: v_add_f32_e32 v21, v21, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_add_f32_e32 v22, v22, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
-; GCN-NEXT: v_add_f32_e32 v23, v23, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_add_f32_e32 v24, v24, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
-; GCN-NEXT: v_add_f32_e32 v25, v25, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_add_f32_e32 v26, v26, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
-; GCN-NEXT: v_add_f32_e32 v27, v27, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_add_f32_e32 v28, v28, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: v_add_f32_e32 v29, v29, v31
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124
-; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128
-; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: v_add_f32_e32 v30, v30, v31
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v33
-; GCN-NEXT: v_add_f32_e32 v31, v31, v32
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
-; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
-; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
-; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
-; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
-; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
-; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
-; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
-; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16
-; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17
-; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18
-; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19
-; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20
-; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21
-; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22
-; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23
-; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24
-; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25
-; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26
-; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27
-; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28
-; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29
-; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30
-; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fadd_v32bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10
-; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12
-; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13
-; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14
-; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18
-; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19
-; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v20
-; GFX7-NEXT: v_cvt_f32_f16_e32 v21, v21
-; GFX7-NEXT: v_cvt_f32_f16_e32 v22, v22
-; GFX7-NEXT: v_cvt_f32_f16_e32 v23, v23
-; GFX7-NEXT: v_cvt_f32_f16_e32 v24, v24
-; GFX7-NEXT: v_cvt_f32_f16_e32 v25, v25
-; GFX7-NEXT: v_cvt_f32_f16_e32 v26, v26
-; GFX7-NEXT: v_cvt_f32_f16_e32 v27, v27
-; GFX7-NEXT: v_cvt_f32_f16_e32 v28, v28
-; GFX7-NEXT: v_cvt_f32_f16_e32 v29, v29
-; GFX7-NEXT: v_cvt_f32_f16_e32 v30, v30
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v1, v1, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v3, v3, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36
-; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v8, v8, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v9, v9, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
-; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v10, v10, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
-; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v11, v11, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52
-; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v12, v12, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
-; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v13, v13, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
-; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v14, v14, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v15, v15, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68
-; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v16, v16, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
-; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v16
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v17, v17, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
-; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v17
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v18, v18, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
-; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v18
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v19, v19, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84
-; GFX7-NEXT: v_cvt_f16_f32_e32 v19, v19
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v20, v20, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
-; GFX7-NEXT: v_cvt_f16_f32_e32 v20, v20
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v21, v21, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92
-; GFX7-NEXT: v_cvt_f16_f32_e32 v21, v21
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v22, v22, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
-; GFX7-NEXT: v_cvt_f16_f32_e32 v22, v22
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v23, v23, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100
-; GFX7-NEXT: v_cvt_f16_f32_e32 v23, v23
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v24, v24, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
-; GFX7-NEXT: v_cvt_f16_f32_e32 v24, v24
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v25, v25, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108
-; GFX7-NEXT: v_cvt_f16_f32_e32 v25, v25
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v26, v26, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
-; GFX7-NEXT: v_cvt_f16_f32_e32 v26, v26
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v27, v27, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
-; GFX7-NEXT: v_cvt_f16_f32_e32 v27, v27
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v28, v28, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
-; GFX7-NEXT: v_cvt_f16_f32_e32 v28, v28
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v29, v29, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124
-; GFX7-NEXT: v_cvt_f16_f32_e32 v29, v29
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v30, v30, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX7-NEXT: v_cvt_f16_f32_e32 v30, v30
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_add_f32_e32 v31, v31, v32
-; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fadd_v32bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_f16_e32 v24, v0, v16
-; GFX8-NEXT: v_add_f16_sdwa v16, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v25, v1, v17
-; GFX8-NEXT: v_add_f16_sdwa v17, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v26, v2, v18
-; GFX8-NEXT: v_add_f16_sdwa v18, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v27, v3, v19
-; GFX8-NEXT: v_add_f16_sdwa v19, v3, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v8, v4, v20
-; GFX8-NEXT: v_add_f16_sdwa v9, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v10, v5, v21
-; GFX8-NEXT: v_add_f16_sdwa v11, v5, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v12, v6, v22
-; GFX8-NEXT: v_add_f16_sdwa v13, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v14, v7, v23
-; GFX8-NEXT: v_add_f16_sdwa v15, v7, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_mov_b32_e32 v0, v24
-; GFX8-NEXT: v_mov_b32_e32 v1, v16
-; GFX8-NEXT: v_mov_b32_e32 v2, v25
-; GFX8-NEXT: v_mov_b32_e32 v3, v17
-; GFX8-NEXT: v_mov_b32_e32 v4, v26
-; GFX8-NEXT: v_mov_b32_e32 v5, v18
-; GFX8-NEXT: v_mov_b32_e32 v6, v27
-; GFX8-NEXT: v_mov_b32_e32 v7, v19
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fadd_v32bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v5
-; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v6
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7
-; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v17
-; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18
-; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v19
-; GFX9-NEXT: v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v20
-; GFX9-NEXT: v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v21
-; GFX9-NEXT: v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v22
-; GFX9-NEXT: v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v23
-; GFX9-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v20, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v21, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v22, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v23, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_add_f16 v0, v0, v16
-; GFX9-NEXT: v_pk_add_f16 v16, v1, v17
-; GFX9-NEXT: v_pk_add_f16 v18, v2, v18
-; GFX9-NEXT: v_pk_add_f16 v17, v3, v19
-; GFX9-NEXT: v_pk_add_f16 v8, v4, v20
-; GFX9-NEXT: v_pk_add_f16 v10, v5, v21
-; GFX9-NEXT: v_pk_add_f16 v12, v6, v22
-; GFX9-NEXT: v_pk_add_f16 v14, v7, v23
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v16
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v18
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v17
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v12
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v14
-; GFX9-NEXT: v_mov_b32_e32 v2, v16
-; GFX9-NEXT: v_mov_b32_e32 v4, v18
-; GFX9-NEXT: v_mov_b32_e32 v6, v17
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fadd_v32bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v5
-; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v6
-; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v7
-; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v17
-; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v18
-; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v19
-; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v21
-; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v22
-; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v23
-; GFX10-NEXT: v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v20, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v21, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v22, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v23, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_pk_add_f16 v0, v0, v16
-; GFX10-NEXT: v_pk_add_f16 v16, v1, v17
-; GFX10-NEXT: v_pk_add_f16 v18, v2, v18
-; GFX10-NEXT: v_pk_add_f16 v17, v3, v19
-; GFX10-NEXT: v_pk_add_f16 v8, v4, v20
-; GFX10-NEXT: v_pk_add_f16 v10, v5, v21
-; GFX10-NEXT: v_pk_add_f16 v12, v6, v22
-; GFX10-NEXT: v_pk_add_f16 v14, v7, v23
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v16
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v18
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v17
-; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v8
-; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v12
-; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v14
-; GFX10-NEXT: v_mov_b32_e32 v2, v16
-; GFX10-NEXT: v_mov_b32_e32 v4, v18
-; GFX10-NEXT: v_mov_b32_e32 v6, v17
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fadd <32 x bfloat> %a, %b
- ret <32 x bfloat> %op
-}
-
-define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
-; GCN-LABEL: v_fadd_bf16_fpimm_0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, 0x3f80
-; GCN-NEXT: v_add_f32_e32 v0, v0, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fadd_bf16_fpimm_0:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, 0x3f80
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fadd_bf16_fpimm_0:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_f16_e32 v0, 0x3f80, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fadd_bf16_fpimm_0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_add_f16_e32 v0, 0x3f80, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fadd_bf16_fpimm_0:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_f16_e32 v0, 0x3f80, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %add = fadd bfloat %arg0, 1.0
- ret bfloat %add
-}
-
-define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
-; GCN-LABEL: v_fadd_bf16_fpimm_1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, 0x4228
-; GCN-NEXT: v_add_f32_e32 v0, v0, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fadd_bf16_fpimm_1:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, 0x4228
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fadd_bf16_fpimm_1:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_f16_e32 v0, 0x4228, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fadd_bf16_fpimm_1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_add_f16_e32 v0, 0x4228, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fadd_bf16_fpimm_1:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_f16_e32 v0, 0x4228, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %add = fadd bfloat %arg0, 42.0
- ret bfloat %add
-}
-
-define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fsub_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e64 v1, -v1
-; GCN-NEXT: v_add_f32_e32 v0, v0, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fsub_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e64 v1, -v1
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fsub_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sub_f16_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fsub_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_sub_f16_e32 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fsub_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_sub_f16_e32 v0, v0, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fsub bfloat %a, %b
- ret bfloat %op
-}
-
-define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
-; GCN-LABEL: v_fsub_v2bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_or_b32_e32 v2, v3, v2
-; GCN-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_add_f32_e32 v0, v0, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_add_f32_e32 v1, v1, v2
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fsub_v2bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fsub_v2bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
-; GFX8-NEXT: v_add_f16_e32 v2, v0, v1
-; GFX8-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fsub_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fsub_v2bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fsub <2 x bfloat> %a, %b
- ret <2 x bfloat> %op
-}
-
-define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
-; GCN-LABEL: v_fsub_v3bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e64 v3, -v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e64 v4, -v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e64 v5, -v5
-; GCN-NEXT: v_add_f32_e32 v0, v0, v3
-; GCN-NEXT: v_add_f32_e32 v1, v1, v4
-; GCN-NEXT: v_add_f32_e32 v2, v2, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fsub_v3bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e64 v3, -v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX7-NEXT: v_cvt_f32_f16_e64 v3, -v4
-; GFX7-NEXT: v_cvt_f32_f16_e64 v4, -v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fsub_v3bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sub_f16_e32 v3, v0, v2
-; GFX8-NEXT: v_sub_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fsub_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_sub_f16_e32 v3, v0, v2
-; GFX9-NEXT: v_sub_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fsub_v3bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_sub_f16_e32 v3, v0, v2
-; GFX10-NEXT: v_sub_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fsub <3 x bfloat> %a, %b
- ret <3 x bfloat> %op
-}
-
-define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
-; GCN-LABEL: v_fsub_v4bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e64 v4, -v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e64 v5, -v5
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e64 v6, -v6
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_f16_e64 v7, -v7
-; GCN-NEXT: v_add_f32_e32 v0, v0, v4
-; GCN-NEXT: v_add_f32_e32 v1, v1, v5
-; GCN-NEXT: v_add_f32_e32 v2, v2, v6
-; GCN-NEXT: v_add_f32_e32 v3, v3, v7
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fsub_v4bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e64 v4, -v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e64 v5, -v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX7-NEXT: v_cvt_f32_f16_e64 v4, -v6
-; GFX7-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e64 v5, -v7
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fsub_v4bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sub_f16_e32 v3, v0, v2
-; GFX8-NEXT: v_sub_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fsub_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_sub_f16_e32 v3, v0, v2
-; GFX9-NEXT: v_sub_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fsub_v4bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_sub_f16_e32 v3, v0, v2
-; GFX10-NEXT: v_sub_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fsub <4 x bfloat> %a, %b
- ret <4 x bfloat> %op
-}
-
-define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fmul_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fmul_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fmul_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fmul_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fmul_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fmul bfloat %a, %b
- ret bfloat %op
-}
-
-define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
-; GCN-LABEL: v_fmul_v2bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_mul_f32_e32 v0, v0, v2
-; GCN-NEXT: v_mul_f32_e32 v1, v1, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fmul_v2bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fmul_v2bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mul_f16_e32 v2, v0, v1
-; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fmul_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fmul_v2bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fmul <2 x bfloat> %a, %b
- ret <2 x bfloat> %op
-}
-
-define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
-; GCN-LABEL: v_fmul_v3bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT: v_mul_f32_e32 v0, v0, v3
-; GCN-NEXT: v_mul_f32_e32 v1, v1, v4
-; GCN-NEXT: v_mul_f32_e32 v2, v2, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fmul_v3bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fmul_v3bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mul_f16_e32 v3, v0, v2
-; GFX8-NEXT: v_mul_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fmul_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, 0xffff
-; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v0
-; GFX9-NEXT: v_bfi_b32 v1, s4, v2, v2
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fmul_v3bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v0, v0
-; GFX10-NEXT: v_bfi_b32 v1, 0xffff, v2, v2
-; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fmul <3 x bfloat> %a, %b
- ret <3 x bfloat> %op
-}
-
-define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
-; GCN-LABEL: v_fmul_v4bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT: v_mul_f32_e32 v0, v0, v4
-; GCN-NEXT: v_mul_f32_e32 v1, v1, v5
-; GCN-NEXT: v_mul_f32_e32 v2, v2, v6
-; GCN-NEXT: v_mul_f32_e32 v3, v3, v7
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fmul_v4bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6
-; GFX7-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7
-; GFX7-NEXT: v_mul_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_mul_f32_e32 v3, v3, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fmul_v4bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mul_f16_e32 v3, v0, v2
-; GFX8-NEXT: v_mul_f16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fmul_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fmul_v4bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fmul <4 x bfloat> %a, %b
- ret <4 x bfloat> %op
-}
-
-define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
-; GCN-LABEL: v_fmul_v8bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT: v_mul_f32_e32 v0, v0, v8
-; GCN-NEXT: v_mul_f32_e32 v1, v1, v9
-; GCN-NEXT: v_mul_f32_e32 v2, v2, v10
-; GCN-NEXT: v_mul_f32_e32 v3, v3, v11
-; GCN-NEXT: v_mul_f32_e32 v4, v4, v12
-; GCN-NEXT: v_mul_f32_e32 v5, v5, v13
-; GCN-NEXT: v_mul_f32_e32 v6, v6, v14
-; GCN-NEXT: v_mul_f32_e32 v7, v7, v15
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fmul_v8bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v10
-; GFX7-NEXT: v_mul_f32_e32 v1, v1, v9
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v11
-; GFX7-NEXT: v_mul_f32_e32 v2, v2, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v12
-; GFX7-NEXT: v_mul_f32_e32 v3, v3, v9
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v13
-; GFX7-NEXT: v_mul_f32_e32 v4, v4, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v14
-; GFX7-NEXT: v_mul_f32_e32 v5, v5, v9
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v15
-; GFX7-NEXT: v_mul_f32_e32 v6, v6, v8
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_mul_f32_e32 v7, v7, v9
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fmul_v8bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mul_f16_e32 v6, v0, v4
-; GFX8-NEXT: v_mul_f16_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_mul_f16_e32 v2, v1, v5
-; GFX8-NEXT: v_mul_f16_sdwa v3, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_mov_b32_e32 v0, v6
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fmul_v8bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v5
-; GFX9-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4
-; GFX9-NEXT: v_pk_mul_f16 v2, v1, v5
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fmul_v8bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5
-; GFX10-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_pk_mul_f16 v0, v0, v4
-; GFX10-NEXT: v_pk_mul_f16 v2, v1, v5
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fmul <8 x bfloat> %a, %b
- ret <8 x bfloat> %op
-}
-
-define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
-; GCN-LABEL: v_fmul_v16bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT: v_mul_f32_e32 v0, v0, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v17
-; GCN-NEXT: v_mul_f32_e32 v1, v1, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v18
-; GCN-NEXT: v_mul_f32_e32 v2, v2, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v19
-; GCN-NEXT: v_mul_f32_e32 v3, v3, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v20
-; GCN-NEXT: v_mul_f32_e32 v4, v4, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v21
-; GCN-NEXT: v_mul_f32_e32 v5, v5, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v22
-; GCN-NEXT: v_mul_f32_e32 v6, v6, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v23
-; GCN-NEXT: v_mul_f32_e32 v7, v7, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v24
-; GCN-NEXT: v_mul_f32_e32 v8, v8, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v25
-; GCN-NEXT: v_mul_f32_e32 v9, v9, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v26
-; GCN-NEXT: v_mul_f32_e32 v10, v10, v16
-; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32
-; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT: v_cvt_f32_f16_e32 v17, v27
-; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT: v_cvt_f32_f16_e32 v18, v28
-; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT: v_cvt_f32_f16_e32 v19, v29
-; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT: v_cvt_f32_f16_e32 v20, v30
-; GCN-NEXT: v_mul_f32_e32 v11, v11, v17
-; GCN-NEXT: v_mul_f32_e32 v12, v12, v18
-; GCN-NEXT: v_mul_f32_e32 v13, v13, v19
-; GCN-NEXT: v_mul_f32_e32 v14, v14, v20
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
-; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
-; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
-; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
-; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
-; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
-; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
-; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT: v_mul_f32_e32 v15, v15, v16
-; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fmul_v16bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v16
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v20
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_mul_f32_e32 v1, v1, v16
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v18
-; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v21
-; GFX7-NEXT: v_mul_f32_e32 v4, v4, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT: v_mul_f32_e32 v2, v2, v16
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v19
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v22
-; GFX7-NEXT: v_mul_f32_e32 v5, v5, v18
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GFX7-NEXT: v_mul_f32_e32 v3, v3, v16
-; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32
-; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v23
-; GFX7-NEXT: v_mul_f32_e32 v6, v6, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v24
-; GFX7-NEXT: v_mul_f32_e32 v7, v7, v18
-; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v25
-; GFX7-NEXT: v_mul_f32_e32 v8, v8, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v26
-; GFX7-NEXT: v_mul_f32_e32 v9, v9, v18
-; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v27
-; GFX7-NEXT: v_mul_f32_e32 v10, v10, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v28
-; GFX7-NEXT: v_mul_f32_e32 v11, v11, v18
-; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13
-; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v29
-; GFX7-NEXT: v_mul_f32_e32 v12, v12, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v30
-; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15
-; GFX7-NEXT: v_mul_f32_e32 v13, v13, v18
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_mul_f32_e32 v14, v14, v17
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8
-; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9
-; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10
-; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11
-; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12
-; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13
-; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT: v_mul_f32_e32 v15, v15, v16
-; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fmul_v16bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mul_f16_e32 v12, v0, v8
-; GFX8-NEXT: v_mul_f16_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_mul_f16_e32 v13, v1, v9
-; GFX8-NEXT: v_mul_f16_sdwa v9, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_mul_f16_e32 v4, v2, v10
-; GFX8-NEXT: v_mul_f16_sdwa v5, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_mul_f16_e32 v6, v3, v11
-; GFX8-NEXT: v_mul_f16_sdwa v7, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_mov_b32_e32 v0, v12
-; GFX8-NEXT: v_mov_b32_e32 v1, v8
-; GFX8-NEXT: v_mov_b32_e32 v2, v13
-; GFX8-NEXT: v_mov_b32_e32 v3, v9
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fmul_v16bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v9
-; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v11
-; GFX9-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, v8
-; GFX9-NEXT: v_pk_mul_f16 v8, v1, v9
-; GFX9-NEXT: v_pk_mul_f16 v4, v2, v10
-; GFX9-NEXT: v_pk_mul_f16 v6, v3, v11
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX9-NEXT: v_mov_b32_e32 v2, v8
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fmul_v16bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v8
-; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v9
-; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v10
-; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v11
-; GFX10-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_pk_mul_f16 v0, v0, v8
-; GFX10-NEXT: v_pk_mul_f16 v8, v1, v9
-; GFX10-NEXT: v_pk_mul_f16 v4, v2, v10
-; GFX10-NEXT: v_pk_mul_f16 v6, v3, v11
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v8
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX10-NEXT: v_mov_b32_e32 v2, v8
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fmul <16 x bfloat> %a, %b
- ret <16 x bfloat> %op
-}
-
-define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
-; GCN-LABEL: v_fmul_v32bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: v_mul_f32_e32 v0, v0, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
-; GCN-NEXT: v_mul_f32_e32 v1, v1, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_mul_f32_e32 v2, v2, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
-; GCN-NEXT: v_mul_f32_e32 v3, v3, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_mul_f32_e32 v4, v4, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
-; GCN-NEXT: v_mul_f32_e32 v5, v5, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_mul_f32_e32 v6, v6, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
-; GCN-NEXT: v_mul_f32_e32 v7, v7, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_mul_f32_e32 v8, v8, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
-; GCN-NEXT: v_mul_f32_e32 v9, v9, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_mul_f32_e32 v10, v10, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
-; GCN-NEXT: v_mul_f32_e32 v11, v11, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_mul_f32_e32 v12, v12, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
-; GCN-NEXT: v_mul_f32_e32 v13, v13, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_mul_f32_e32 v14, v14, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
-; GCN-NEXT: v_mul_f32_e32 v15, v15, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_mul_f32_e32 v16, v16, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
-; GCN-NEXT: v_mul_f32_e32 v17, v17, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_mul_f32_e32 v18, v18, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
-; GCN-NEXT: v_mul_f32_e32 v19, v19, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_mul_f32_e32 v20, v20, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
-; GCN-NEXT: v_mul_f32_e32 v21, v21, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_mul_f32_e32 v22, v22, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
-; GCN-NEXT: v_mul_f32_e32 v23, v23, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_mul_f32_e32 v24, v24, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
-; GCN-NEXT: v_mul_f32_e32 v25, v25, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_mul_f32_e32 v26, v26, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
-; GCN-NEXT: v_mul_f32_e32 v27, v27, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_mul_f32_e32 v28, v28, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: v_mul_f32_e32 v29, v29, v31
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124
-; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128
-; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: v_mul_f32_e32 v30, v30, v31
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v33
-; GCN-NEXT: v_mul_f32_e32 v31, v31, v32
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
-; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
-; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
-; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
-; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
-; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
-; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
-; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
-; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16
-; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17
-; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18
-; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19
-; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20
-; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21
-; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22
-; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23
-; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24
-; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25
-; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26
-; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27
-; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28
-; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29
-; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30
-; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fmul_v32bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10
-; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12
-; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13
-; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14
-; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18
-; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19
-; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v20
-; GFX7-NEXT: v_cvt_f32_f16_e32 v21, v21
-; GFX7-NEXT: v_cvt_f32_f16_e32 v22, v22
-; GFX7-NEXT: v_cvt_f32_f16_e32 v23, v23
-; GFX7-NEXT: v_cvt_f32_f16_e32 v24, v24
-; GFX7-NEXT: v_cvt_f32_f16_e32 v25, v25
-; GFX7-NEXT: v_cvt_f32_f16_e32 v26, v26
-; GFX7-NEXT: v_cvt_f32_f16_e32 v27, v27
-; GFX7-NEXT: v_cvt_f32_f16_e32 v28, v28
-; GFX7-NEXT: v_cvt_f32_f16_e32 v29, v29
-; GFX7-NEXT: v_cvt_f32_f16_e32 v30, v30
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v1, v1, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v2, v2, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v3, v3, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v4, v4, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v5, v5, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v6, v6, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v7, v7, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36
-; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v8, v8, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v9, v9, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
-; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v10, v10, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
-; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v11, v11, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52
-; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v12, v12, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
-; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v13, v13, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
-; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v14, v14, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v15, v15, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68
-; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v16, v16, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
-; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v16
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v17, v17, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
-; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v17
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v18, v18, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
-; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v18
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v19, v19, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84
-; GFX7-NEXT: v_cvt_f16_f32_e32 v19, v19
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v20, v20, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
-; GFX7-NEXT: v_cvt_f16_f32_e32 v20, v20
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v21, v21, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92
-; GFX7-NEXT: v_cvt_f16_f32_e32 v21, v21
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v22, v22, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
-; GFX7-NEXT: v_cvt_f16_f32_e32 v22, v22
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v23, v23, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100
-; GFX7-NEXT: v_cvt_f16_f32_e32 v23, v23
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v24, v24, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
-; GFX7-NEXT: v_cvt_f16_f32_e32 v24, v24
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v25, v25, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108
-; GFX7-NEXT: v_cvt_f16_f32_e32 v25, v25
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v26, v26, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
-; GFX7-NEXT: v_cvt_f16_f32_e32 v26, v26
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v27, v27, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
-; GFX7-NEXT: v_cvt_f16_f32_e32 v27, v27
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v28, v28, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
-; GFX7-NEXT: v_cvt_f16_f32_e32 v28, v28
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v29, v29, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124
-; GFX7-NEXT: v_cvt_f16_f32_e32 v29, v29
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v30, v30, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX7-NEXT: v_cvt_f16_f32_e32 v30, v30
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_mul_f32_e32 v31, v31, v32
-; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fmul_v32bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mul_f16_e32 v24, v0, v16
-; GFX8-NEXT: v_mul_f16_sdwa v16, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_mul_f16_e32 v25, v1, v17
-; GFX8-NEXT: v_mul_f16_sdwa v17, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_mul_f16_e32 v26, v2, v18
-; GFX8-NEXT: v_mul_f16_sdwa v18, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_mul_f16_e32 v27, v3, v19
-; GFX8-NEXT: v_mul_f16_sdwa v19, v3, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_mul_f16_e32 v8, v4, v20
-; GFX8-NEXT: v_mul_f16_sdwa v9, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_mul_f16_e32 v10, v5, v21
-; GFX8-NEXT: v_mul_f16_sdwa v11, v5, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_mul_f16_e32 v12, v6, v22
-; GFX8-NEXT: v_mul_f16_sdwa v13, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_mul_f16_e32 v14, v7, v23
-; GFX8-NEXT: v_mul_f16_sdwa v15, v7, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_mov_b32_e32 v0, v24
-; GFX8-NEXT: v_mov_b32_e32 v1, v16
-; GFX8-NEXT: v_mov_b32_e32 v2, v25
-; GFX8-NEXT: v_mov_b32_e32 v3, v17
-; GFX8-NEXT: v_mov_b32_e32 v4, v26
-; GFX8-NEXT: v_mov_b32_e32 v5, v18
-; GFX8-NEXT: v_mov_b32_e32 v6, v27
-; GFX8-NEXT: v_mov_b32_e32 v7, v19
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fmul_v32bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v5
-; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v6
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7
-; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v17
-; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18
-; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v19
-; GFX9-NEXT: v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v20
-; GFX9-NEXT: v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v21
-; GFX9-NEXT: v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v22
-; GFX9-NEXT: v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v23
-; GFX9-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v20, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v21, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v22, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v23, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, v16
-; GFX9-NEXT: v_pk_mul_f16 v16, v1, v17
-; GFX9-NEXT: v_pk_mul_f16 v18, v2, v18
-; GFX9-NEXT: v_pk_mul_f16 v17, v3, v19
-; GFX9-NEXT: v_pk_mul_f16 v8, v4, v20
-; GFX9-NEXT: v_pk_mul_f16 v10, v5, v21
-; GFX9-NEXT: v_pk_mul_f16 v12, v6, v22
-; GFX9-NEXT: v_pk_mul_f16 v14, v7, v23
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v16
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v18
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v17
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v12
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v14
-; GFX9-NEXT: v_mov_b32_e32 v2, v16
-; GFX9-NEXT: v_mov_b32_e32 v4, v18
-; GFX9-NEXT: v_mov_b32_e32 v6, v17
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fmul_v32bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v5
-; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v6
-; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v7
-; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v17
-; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v18
-; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v19
-; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v21
-; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v22
-; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v23
-; GFX10-NEXT: v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v20, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v21, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v22, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v23, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_pk_mul_f16 v0, v0, v16
-; GFX10-NEXT: v_pk_mul_f16 v16, v1, v17
-; GFX10-NEXT: v_pk_mul_f16 v18, v2, v18
-; GFX10-NEXT: v_pk_mul_f16 v17, v3, v19
-; GFX10-NEXT: v_pk_mul_f16 v8, v4, v20
-; GFX10-NEXT: v_pk_mul_f16 v10, v5, v21
-; GFX10-NEXT: v_pk_mul_f16 v12, v6, v22
-; GFX10-NEXT: v_pk_mul_f16 v14, v7, v23
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v16
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v18
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v17
-; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v8
-; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v12
-; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v14
-; GFX10-NEXT: v_mov_b32_e32 v2, v16
-; GFX10-NEXT: v_mov_b32_e32 v4, v18
-; GFX10-NEXT: v_mov_b32_e32 v6, v17
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fmul <32 x bfloat> %a, %b
- ret <32 x bfloat> %op
-}
-
-define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fdiv_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
-; GCN-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
-; GCN-NEXT: v_rcp_f32_e32 v4, v2
-; GCN-NEXT: v_fma_f32 v5, -v2, v4, 1.0
-; GCN-NEXT: v_fma_f32 v4, v5, v4, v4
-; GCN-NEXT: v_mul_f32_e32 v5, v3, v4
-; GCN-NEXT: v_fma_f32 v6, -v2, v5, v3
-; GCN-NEXT: v_fma_f32 v5, v6, v4, v5
-; GCN-NEXT: v_fma_f32 v2, -v2, v5, v3
-; GCN-NEXT: v_div_fmas_f32 v2, v2, v4, v5
-; GCN-NEXT: v_div_fixup_f32 v0, v2, v1, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fdiv_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
-; GFX7-NEXT: v_rcp_f32_e32 v3, v2
-; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
-; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fdiv_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-NEXT: v_rcp_f32_e32 v2, v2
-; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2
-; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fdiv_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2
-; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fdiv_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX10-NEXT: v_rcp_f32_e32 v2, v2
-; GFX10-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_div_fixup_f16 v0, v2, v1, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fdiv bfloat %a, %b
- ret bfloat %op
-}
-
-declare bfloat @llvm.fabs.bf16(bfloat)
-
-define bfloat @v_fabs_bf16(bfloat %a) {
-; GCN-LABEL: v_fabs_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fabs_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fabs_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fabs_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fabs_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call bfloat @llvm.fabs.bf16(bfloat %a)
- ret bfloat %op
-}
-
-define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) {
-; GCN-LABEL: s_fabs_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_and_b32 s0, s0, 0x7fff
-; GCN-NEXT: s_and_b32 s0, 0xffff, s0
-; GCN-NEXT: ; return to shader part epilog
-;
-; GFX7-LABEL: s_fabs_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_and_b32 s0, s0, 0x7fff
-; GFX7-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX7-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_fabs_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b32 s0, s0, 0x7fff
-; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX9-LABEL: s_fabs_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s0, s0, 0x7fff
-; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_fabs_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b32 s0, s0, 0x7fff
-; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX10-NEXT: ; return to shader part epilog
- %op = call bfloat @llvm.fabs.bf16(bfloat %a)
- %cast = bitcast bfloat %op to i16
- %zext = zext i16 %cast to i32
- %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
- ret i32 %readlane
-}
-
-define bfloat @v_fneg_bf16(bfloat %a) {
-; GCN-LABEL: v_fneg_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fneg_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fneg_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fneg_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fneg_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fneg bfloat %a
- ret bfloat %op
-}
-
-declare i32 @llvm.amdgcn.readfirstlane(i32)
-
-; FIXME: readfirstlane hack for other bugs
-define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) {
-; GCN-LABEL: s_fneg_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_xor_b32 s0, s0, 0x8000
-; GCN-NEXT: s_and_b32 s0, 0xffff, s0
-; GCN-NEXT: ; return to shader part epilog
-;
-; GFX7-LABEL: s_fneg_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_xor_b32 s0, s0, 0x8000
-; GFX7-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX7-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_fneg_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_xor_b32 s0, s0, 0x8000
-; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX9-LABEL: s_fneg_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_xor_b32 s0, s0, 0x8000
-; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_fneg_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_xor_b32 s0, s0, 0x8000
-; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX10-NEXT: ; return to shader part epilog
- %op = fneg bfloat %a
- %cast = bitcast bfloat %op to i16
- %zext = zext i16 %cast to i32
- %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
- ret i32 %readlane
-}
-
-define bfloat @v_fneg_fabs_bf16(bfloat %a) {
-; GCN-LABEL: v_fneg_fabs_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v0, 0x8000, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fneg_fabs_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_or_b32_e32 v0, 0x8000, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fneg_fabs_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_or_b32_e32 v0, 0x8000, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fneg_fabs_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_or_b32_e32 v0, 0x8000, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fneg_fabs_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_or_b32_e32 v0, 0x8000, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
- %op = fneg bfloat %fabs
- ret bfloat %op
-}
-
-; FIXME: readfirstlane hack for other bugs
-define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
-; GCN-LABEL: s_fneg_fabs_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_bitset1_b32 s0, 15
-; GCN-NEXT: s_and_b32 s0, 0xffff, s0
-; GCN-NEXT: ; return to shader part epilog
-;
-; GFX7-LABEL: s_fneg_fabs_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_bitset1_b32 s0, 15
-; GFX7-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX7-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_fneg_fabs_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_bitset1_b32 s0, 15
-; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX9-LABEL: s_fneg_fabs_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_bitset1_b32 s0, 15
-; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_fneg_fabs_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_bitset1_b32 s0, 15
-; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX10-NEXT: ; return to shader part epilog
- %fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
- %op = fneg bfloat %fabs
- %cast = bitcast bfloat %op to i16
- %zext = zext i16 %cast to i32
- %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
- ret i32 %readlane
-}
-
-declare bfloat @llvm.minnum.bf16(bfloat, bfloat)
-declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
-declare <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat>, <3 x bfloat>)
-declare <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
-declare <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat>, <8 x bfloat>)
-declare <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat>, <16 x bfloat>)
-declare <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat>, <32 x bfloat>)
-
-define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_minnum_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_min_f32_e32 v0, v0, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_minnum_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_minnum_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT: v_min_f16_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_minnum_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX9-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX9-NEXT: v_min_f16_e32 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_minnum_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX10-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX10-NEXT: v_min_f16_e32 v0, v0, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call bfloat @llvm.minnum.bf16(bfloat %a, bfloat %b)
- ret bfloat %op
-}
-
-define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
-; GCN-LABEL: v_minnum_v2bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_min_f32_e32 v0, v0, v2
-; GCN-NEXT: v_min_f32_e32 v1, v1, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_minnum_v2bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_min_f32_e32 v1, v1, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_minnum_v2bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_e32 v2, v0, v0
-; GFX8-NEXT: v_max_f16_e32 v3, v1, v1
-; GFX8-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_min_f16_e32 v2, v2, v3
-; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_minnum_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
-; GFX9-NEXT: v_pk_min_f16 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_minnum_v2bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
-; GFX10-NEXT: v_pk_min_f16 v0, v0, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
- ret <2 x bfloat> %op
-}
-
-define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
-; GCN-LABEL: v_minnum_v3bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT: v_min_f32_e32 v0, v0, v3
-; GCN-NEXT: v_min_f32_e32 v1, v1, v4
-; GCN-NEXT: v_min_f32_e32 v2, v2, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_minnum_v3bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_min_f32_e32 v0, v0, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_min_f32_e32 v1, v1, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_minnum_v3bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_e32 v1, v0, v0
-; GFX8-NEXT: v_max_f16_e32 v3, v2, v2
-; GFX8-NEXT: v_min_f16_e32 v3, v1, v3
-; GFX8-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_min_f16_e32 v1, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_minnum_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, 0xffff
-; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v0
-; GFX9-NEXT: v_bfi_b32 v1, s4, v2, v2
-; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
-; GFX9-NEXT: v_pk_min_f16 v0, v0, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_minnum_v3bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v0, v0
-; GFX10-NEXT: v_bfi_b32 v1, 0xffff, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
-; GFX10-NEXT: v_pk_min_f16 v0, v0, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
- ret <3 x bfloat> %op
-}
-
-define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
-; GCN-LABEL: v_minnum_v4bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT: v_min_f32_e32 v0, v0, v4
-; GCN-NEXT: v_min_f32_e32 v1, v1, v5
-; GCN-NEXT: v_min_f32_e32 v2, v2, v6
-; GCN-NEXT: v_min_f32_e32 v3, v3, v7
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_minnum_v4bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_min_f32_e32 v0, v0, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6
-; GFX7-NEXT: v_min_f32_e32 v1, v1, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_min_f32_e32 v3, v3, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_minnum_v4bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_e32 v1, v0, v0
-; GFX8-NEXT: v_max_f16_e32 v3, v2, v2
-; GFX8-NEXT: v_min_f16_e32 v3, v1, v3
-; GFX8-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_min_f16_e32 v1, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_minnum_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT: v_pk_max_f16 v1, v2, v2
-; GFX9-NEXT: v_pk_min_f16 v0, v0, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_minnum_v4bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX10-NEXT: v_pk_max_f16 v1, v2, v2
-; GFX10-NEXT: v_pk_min_f16 v0, v0, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
- ret <4 x bfloat> %op
-}
-
-define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
-; GCN-LABEL: v_minnum_v8bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT: v_min_f32_e32 v0, v0, v8
-; GCN-NEXT: v_min_f32_e32 v1, v1, v9
-; GCN-NEXT: v_min_f32_e32 v2, v2, v10
-; GCN-NEXT: v_min_f32_e32 v3, v3, v11
-; GCN-NEXT: v_min_f32_e32 v4, v4, v12
-; GCN-NEXT: v_min_f32_e32 v5, v5, v13
-; GCN-NEXT: v_min_f32_e32 v6, v6, v14
-; GCN-NEXT: v_min_f32_e32 v7, v7, v15
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_minnum_v8bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_min_f32_e32 v0, v0, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v10
-; GFX7-NEXT: v_min_f32_e32 v1, v1, v9
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v11
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v12
-; GFX7-NEXT: v_min_f32_e32 v3, v3, v9
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v13
-; GFX7-NEXT: v_min_f32_e32 v4, v4, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v14
-; GFX7-NEXT: v_min_f32_e32 v5, v5, v9
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v15
-; GFX7-NEXT: v_min_f32_e32 v6, v6, v8
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_min_f32_e32 v7, v7, v9
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_minnum_v8bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_e32 v2, v0, v0
-; GFX8-NEXT: v_max_f16_e32 v3, v4, v4
-; GFX8-NEXT: v_min_f16_e32 v6, v2, v3
-; GFX8-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_min_f16_e32 v4, v0, v2
-; GFX8-NEXT: v_max_f16_e32 v0, v1, v1
-; GFX8-NEXT: v_max_f16_e32 v2, v5, v5
-; GFX8-NEXT: v_min_f16_e32 v2, v0, v2
-; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_min_f16_e32 v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, v6
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_minnum_v8bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v5
-; GFX9-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT: v_pk_max_f16 v2, v4, v4
-; GFX9-NEXT: v_pk_min_f16 v0, v0, v2
-; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
-; GFX9-NEXT: v_pk_max_f16 v2, v5, v5
-; GFX9-NEXT: v_pk_min_f16 v2, v1, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_minnum_v8bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5
-; GFX10-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX10-NEXT: v_pk_max_f16 v2, v4, v4
-; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
-; GFX10-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX10-NEXT: v_pk_min_f16 v0, v0, v2
-; GFX10-NEXT: v_pk_min_f16 v2, v1, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
- ret <8 x bfloat> %op
-}
-
-define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
-; GCN-LABEL: v_minnum_v16bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT: v_min_f32_e32 v0, v0, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v17
-; GCN-NEXT: v_min_f32_e32 v1, v1, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v18
-; GCN-NEXT: v_min_f32_e32 v2, v2, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v19
-; GCN-NEXT: v_min_f32_e32 v3, v3, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v20
-; GCN-NEXT: v_min_f32_e32 v4, v4, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v21
-; GCN-NEXT: v_min_f32_e32 v5, v5, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v22
-; GCN-NEXT: v_min_f32_e32 v6, v6, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v23
-; GCN-NEXT: v_min_f32_e32 v7, v7, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v24
-; GCN-NEXT: v_min_f32_e32 v8, v8, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v25
-; GCN-NEXT: v_min_f32_e32 v9, v9, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v26
-; GCN-NEXT: v_min_f32_e32 v10, v10, v16
-; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32
-; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT: v_cvt_f32_f16_e32 v17, v27
-; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT: v_cvt_f32_f16_e32 v18, v28
-; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT: v_cvt_f32_f16_e32 v19, v29
-; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT: v_cvt_f32_f16_e32 v20, v30
-; GCN-NEXT: v_min_f32_e32 v11, v11, v17
-; GCN-NEXT: v_min_f32_e32 v12, v12, v18
-; GCN-NEXT: v_min_f32_e32 v13, v13, v19
-; GCN-NEXT: v_min_f32_e32 v14, v14, v20
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
-; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
-; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
-; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
-; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
-; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
-; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
-; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT: v_min_f32_e32 v15, v15, v16
-; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_minnum_v16bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_min_f32_e32 v0, v0, v16
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v20
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_min_f32_e32 v1, v1, v16
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v18
-; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v21
-; GFX7-NEXT: v_min_f32_e32 v4, v4, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v16
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v19
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v22
-; GFX7-NEXT: v_min_f32_e32 v5, v5, v18
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GFX7-NEXT: v_min_f32_e32 v3, v3, v16
-; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32
-; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v23
-; GFX7-NEXT: v_min_f32_e32 v6, v6, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v24
-; GFX7-NEXT: v_min_f32_e32 v7, v7, v18
-; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v25
-; GFX7-NEXT: v_min_f32_e32 v8, v8, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v26
-; GFX7-NEXT: v_min_f32_e32 v9, v9, v18
-; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v27
-; GFX7-NEXT: v_min_f32_e32 v10, v10, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v28
-; GFX7-NEXT: v_min_f32_e32 v11, v11, v18
-; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13
-; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v29
-; GFX7-NEXT: v_min_f32_e32 v12, v12, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v30
-; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15
-; GFX7-NEXT: v_min_f32_e32 v13, v13, v18
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_min_f32_e32 v14, v14, v17
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8
-; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9
-; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10
-; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11
-; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12
-; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13
-; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT: v_min_f32_e32 v15, v15, v16
-; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_minnum_v16bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_e32 v4, v0, v0
-; GFX8-NEXT: v_max_f16_e32 v5, v8, v8
-; GFX8-NEXT: v_min_f16_e32 v12, v4, v5
-; GFX8-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v4, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_min_f16_e32 v8, v0, v4
-; GFX8-NEXT: v_max_f16_e32 v0, v1, v1
-; GFX8-NEXT: v_max_f16_e32 v4, v9, v9
-; GFX8-NEXT: v_min_f16_e32 v13, v0, v4
-; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_min_f16_e32 v9, v0, v1
-; GFX8-NEXT: v_max_f16_e32 v0, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v1, v10, v10
-; GFX8-NEXT: v_min_f16_e32 v4, v0, v1
-; GFX8-NEXT: v_max_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_min_f16_e32 v5, v0, v1
-; GFX8-NEXT: v_max_f16_e32 v0, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v1, v11, v11
-; GFX8-NEXT: v_min_f16_e32 v6, v0, v1
-; GFX8-NEXT: v_max_f16_sdwa v0, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_min_f16_e32 v7, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, v12
-; GFX8-NEXT: v_mov_b32_e32 v1, v8
-; GFX8-NEXT: v_mov_b32_e32 v2, v13
-; GFX8-NEXT: v_mov_b32_e32 v3, v9
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_minnum_v16bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v9
-; GFX9-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v10
-; GFX9-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT: v_pk_max_f16 v4, v8, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v11
-; GFX9-NEXT: v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_min_f16 v0, v0, v4
-; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
-; GFX9-NEXT: v_pk_max_f16 v4, v9, v9
-; GFX9-NEXT: v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_min_f16 v8, v1, v4
-; GFX9-NEXT: v_pk_max_f16 v1, v2, v2
-; GFX9-NEXT: v_pk_max_f16 v2, v10, v10
-; GFX9-NEXT: v_pk_min_f16 v4, v1, v2
-; GFX9-NEXT: v_pk_max_f16 v1, v3, v3
-; GFX9-NEXT: v_pk_max_f16 v2, v11, v11
-; GFX9-NEXT: v_pk_min_f16 v6, v1, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX9-NEXT: v_mov_b32_e32 v2, v8
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_minnum_v16bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v8
-; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v9
-; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v10
-; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v11
-; GFX10-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX10-NEXT: v_pk_max_f16 v4, v8, v8
-; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
-; GFX10-NEXT: v_pk_max_f16 v5, v9, v9
-; GFX10-NEXT: v_pk_max_f16 v6, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v7, v10, v10
-; GFX10-NEXT: v_pk_max_f16 v3, v3, v3
-; GFX10-NEXT: v_pk_max_f16 v8, v11, v11
-; GFX10-NEXT: v_pk_min_f16 v0, v0, v4
-; GFX10-NEXT: v_pk_min_f16 v2, v1, v5
-; GFX10-NEXT: v_pk_min_f16 v4, v6, v7
-; GFX10-NEXT: v_pk_min_f16 v6, v3, v8
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
- ret <16 x bfloat> %op
-}
-
-define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
-; GCN-LABEL: v_minnum_v32bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: v_min_f32_e32 v0, v0, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
-; GCN-NEXT: v_min_f32_e32 v1, v1, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_min_f32_e32 v2, v2, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
-; GCN-NEXT: v_min_f32_e32 v3, v3, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_min_f32_e32 v4, v4, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
-; GCN-NEXT: v_min_f32_e32 v5, v5, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_min_f32_e32 v6, v6, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
-; GCN-NEXT: v_min_f32_e32 v7, v7, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_min_f32_e32 v8, v8, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
-; GCN-NEXT: v_min_f32_e32 v9, v9, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_min_f32_e32 v10, v10, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
-; GCN-NEXT: v_min_f32_e32 v11, v11, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_min_f32_e32 v12, v12, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
-; GCN-NEXT: v_min_f32_e32 v13, v13, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_min_f32_e32 v14, v14, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
-; GCN-NEXT: v_min_f32_e32 v15, v15, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_min_f32_e32 v16, v16, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
-; GCN-NEXT: v_min_f32_e32 v17, v17, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_min_f32_e32 v18, v18, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
-; GCN-NEXT: v_min_f32_e32 v19, v19, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_min_f32_e32 v20, v20, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
-; GCN-NEXT: v_min_f32_e32 v21, v21, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_min_f32_e32 v22, v22, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
-; GCN-NEXT: v_min_f32_e32 v23, v23, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_min_f32_e32 v24, v24, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
-; GCN-NEXT: v_min_f32_e32 v25, v25, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_min_f32_e32 v26, v26, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
-; GCN-NEXT: v_min_f32_e32 v27, v27, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_min_f32_e32 v28, v28, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: v_min_f32_e32 v29, v29, v31
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124
-; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128
-; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: v_min_f32_e32 v30, v30, v31
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v33
-; GCN-NEXT: v_min_f32_e32 v31, v31, v32
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
-; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
-; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
-; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
-; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
-; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
-; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
-; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
-; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16
-; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17
-; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18
-; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19
-; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20
-; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21
-; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22
-; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23
-; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24
-; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25
-; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26
-; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27
-; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28
-; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29
-; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30
-; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_minnum_v32bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10
-; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12
-; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13
-; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14
-; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18
-; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19
-; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v20
-; GFX7-NEXT: v_cvt_f32_f16_e32 v21, v21
-; GFX7-NEXT: v_cvt_f32_f16_e32 v22, v22
-; GFX7-NEXT: v_cvt_f32_f16_e32 v23, v23
-; GFX7-NEXT: v_cvt_f32_f16_e32 v24, v24
-; GFX7-NEXT: v_cvt_f32_f16_e32 v25, v25
-; GFX7-NEXT: v_cvt_f32_f16_e32 v26, v26
-; GFX7-NEXT: v_cvt_f32_f16_e32 v27, v27
-; GFX7-NEXT: v_cvt_f32_f16_e32 v28, v28
-; GFX7-NEXT: v_cvt_f32_f16_e32 v29, v29
-; GFX7-NEXT: v_cvt_f32_f16_e32 v30, v30
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v0, v0, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v1, v1, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v3, v3, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v4, v4, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v5, v5, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v6, v6, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v7, v7, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36
-; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v8, v8, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v9, v9, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
-; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v10, v10, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
-; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v11, v11, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52
-; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v12, v12, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
-; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v13, v13, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
-; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v14, v14, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v15, v15, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68
-; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v16, v16, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
-; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v16
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v17, v17, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
-; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v17
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v18, v18, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
-; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v18
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v19, v19, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84
-; GFX7-NEXT: v_cvt_f16_f32_e32 v19, v19
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v20, v20, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
-; GFX7-NEXT: v_cvt_f16_f32_e32 v20, v20
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v21, v21, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92
-; GFX7-NEXT: v_cvt_f16_f32_e32 v21, v21
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v22, v22, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
-; GFX7-NEXT: v_cvt_f16_f32_e32 v22, v22
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v23, v23, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100
-; GFX7-NEXT: v_cvt_f16_f32_e32 v23, v23
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v24, v24, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
-; GFX7-NEXT: v_cvt_f16_f32_e32 v24, v24
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v25, v25, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108
-; GFX7-NEXT: v_cvt_f16_f32_e32 v25, v25
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v26, v26, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
-; GFX7-NEXT: v_cvt_f16_f32_e32 v26, v26
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v27, v27, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
-; GFX7-NEXT: v_cvt_f16_f32_e32 v27, v27
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v28, v28, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
-; GFX7-NEXT: v_cvt_f16_f32_e32 v28, v28
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v29, v29, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124
-; GFX7-NEXT: v_cvt_f16_f32_e32 v29, v29
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v30, v30, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX7-NEXT: v_cvt_f16_f32_e32 v30, v30
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_min_f32_e32 v31, v31, v32
-; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_minnum_v32bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_e32 v8, v0, v0
-; GFX8-NEXT: v_max_f16_e32 v9, v16, v16
-; GFX8-NEXT: v_min_f16_e32 v24, v8, v9
-; GFX8-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v8, v16, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_min_f16_e32 v16, v0, v8
-; GFX8-NEXT: v_max_f16_e32 v0, v1, v1
-; GFX8-NEXT: v_max_f16_e32 v8, v17, v17
-; GFX8-NEXT: v_min_f16_e32 v25, v0, v8
-; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v17, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_min_f16_e32 v17, v0, v1
-; GFX8-NEXT: v_max_f16_e32 v0, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v1, v18, v18
-; GFX8-NEXT: v_min_f16_e32 v26, v0, v1
-; GFX8-NEXT: v_max_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v18, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_min_f16_e32 v18, v0, v1
-; GFX8-NEXT: v_max_f16_e32 v0, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v1, v19, v19
-; GFX8-NEXT: v_min_f16_e32 v27, v0, v1
-; GFX8-NEXT: v_max_f16_sdwa v0, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v19, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_min_f16_e32 v19, v0, v1
-; GFX8-NEXT: v_max_f16_e32 v0, v4, v4
-; GFX8-NEXT: v_max_f16_e32 v1, v20, v20
-; GFX8-NEXT: v_min_f16_e32 v8, v0, v1
-; GFX8-NEXT: v_max_f16_sdwa v0, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v20, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_min_f16_e32 v9, v0, v1
-; GFX8-NEXT: v_max_f16_e32 v0, v5, v5
-; GFX8-NEXT: v_max_f16_e32 v1, v21, v21
-; GFX8-NEXT: v_min_f16_e32 v10, v0, v1
-; GFX8-NEXT: v_max_f16_sdwa v0, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v21, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_min_f16_e32 v11, v0, v1
-; GFX8-NEXT: v_max_f16_e32 v0, v6, v6
-; GFX8-NEXT: v_max_f16_e32 v1, v22, v22
-; GFX8-NEXT: v_min_f16_e32 v12, v0, v1
-; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v22, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_min_f16_e32 v13, v0, v1
-; GFX8-NEXT: v_max_f16_e32 v0, v7, v7
-; GFX8-NEXT: v_max_f16_e32 v1, v23, v23
-; GFX8-NEXT: v_min_f16_e32 v14, v0, v1
-; GFX8-NEXT: v_max_f16_sdwa v0, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v23, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_min_f16_e32 v15, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, v24
-; GFX8-NEXT: v_mov_b32_e32 v1, v16
-; GFX8-NEXT: v_mov_b32_e32 v2, v25
-; GFX8-NEXT: v_mov_b32_e32 v3, v17
-; GFX8-NEXT: v_mov_b32_e32 v4, v26
-; GFX8-NEXT: v_mov_b32_e32 v5, v18
-; GFX8-NEXT: v_mov_b32_e32 v6, v27
-; GFX8-NEXT: v_mov_b32_e32 v7, v19
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_minnum_v32bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v17
-; GFX9-NEXT: v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v20
-; GFX9-NEXT: v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18
-; GFX9-NEXT: v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v20, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT: v_pk_max_f16 v8, v16, v16
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v19
-; GFX9-NEXT: v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_min_f16 v0, v0, v8
-; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
-; GFX9-NEXT: v_pk_max_f16 v8, v17, v17
-; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v4
-; GFX9-NEXT: v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_min_f16 v16, v1, v8
-; GFX9-NEXT: v_pk_max_f16 v1, v2, v2
-; GFX9-NEXT: v_pk_max_f16 v2, v18, v18
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v5
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v21
-; GFX9-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_min_f16 v18, v1, v2
-; GFX9-NEXT: v_pk_max_f16 v1, v3, v3
-; GFX9-NEXT: v_pk_max_f16 v2, v19, v19
-; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v6
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v22
-; GFX9-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v21, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_min_f16 v17, v1, v2
-; GFX9-NEXT: v_pk_max_f16 v1, v4, v4
-; GFX9-NEXT: v_pk_max_f16 v2, v20, v20
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v23
-; GFX9-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v22, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_min_f16 v8, v1, v2
-; GFX9-NEXT: v_pk_max_f16 v1, v5, v5
-; GFX9-NEXT: v_pk_max_f16 v2, v21, v21
-; GFX9-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v23, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_min_f16 v10, v1, v2
-; GFX9-NEXT: v_pk_max_f16 v1, v6, v6
-; GFX9-NEXT: v_pk_max_f16 v2, v22, v22
-; GFX9-NEXT: v_pk_min_f16 v12, v1, v2
-; GFX9-NEXT: v_pk_max_f16 v1, v7, v7
-; GFX9-NEXT: v_pk_max_f16 v2, v23, v23
-; GFX9-NEXT: v_pk_min_f16 v14, v1, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v16
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v18
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v17
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v12
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v14
-; GFX9-NEXT: v_mov_b32_e32 v2, v16
-; GFX9-NEXT: v_mov_b32_e32 v4, v18
-; GFX9-NEXT: v_mov_b32_e32 v6, v17
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_minnum_v32bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v17
-; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v18
-; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v5
-; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v6
-; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v7
-; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v19
-; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v21
-; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v22
-; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v23
-; GFX10-NEXT: v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v20, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v21, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX10-NEXT: v_pk_max_f16 v8, v16, v16
-; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
-; GFX10-NEXT: v_pk_max_f16 v9, v17, v17
-; GFX10-NEXT: v_pk_max_f16 v10, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v11, v18, v18
-; GFX10-NEXT: v_mov_b32_sdwa v22, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v23, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_pk_min_f16 v0, v0, v8
-; GFX10-NEXT: v_pk_min_f16 v2, v1, v9
-; GFX10-NEXT: v_pk_min_f16 v16, v10, v11
-; GFX10-NEXT: v_pk_max_f16 v1, v3, v3
-; GFX10-NEXT: v_pk_max_f16 v3, v19, v19
-; GFX10-NEXT: v_pk_max_f16 v4, v4, v4
-; GFX10-NEXT: v_pk_max_f16 v8, v20, v20
-; GFX10-NEXT: v_pk_max_f16 v5, v5, v5
-; GFX10-NEXT: v_pk_max_f16 v9, v21, v21
-; GFX10-NEXT: v_pk_max_f16 v11, v6, v6
-; GFX10-NEXT: v_pk_max_f16 v12, v22, v22
-; GFX10-NEXT: v_pk_max_f16 v7, v7, v7
-; GFX10-NEXT: v_pk_max_f16 v13, v23, v23
-; GFX10-NEXT: v_pk_min_f16 v6, v1, v3
-; GFX10-NEXT: v_pk_min_f16 v8, v4, v8
-; GFX10-NEXT: v_pk_min_f16 v10, v5, v9
-; GFX10-NEXT: v_pk_min_f16 v12, v11, v12
-; GFX10-NEXT: v_pk_min_f16 v14, v7, v13
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v16
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v8
-; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v12
-; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v14
-; GFX10-NEXT: v_mov_b32_e32 v4, v16
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
- ret <32 x bfloat> %op
-}
-
-
-declare bfloat @llvm.maxnum.bf16(bfloat, bfloat)
-declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
-declare <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat>, <3 x bfloat>)
-declare <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
-declare <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat>, <8 x bfloat>)
-declare <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat>, <16 x bfloat>)
-declare <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat>, <32 x bfloat>)
-
-define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_maxnum_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_max_f32_e32 v0, v0, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_maxnum_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_maxnum_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_maxnum_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX9-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX9-NEXT: v_max_f16_e32 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_maxnum_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX10-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX10-NEXT: v_max_f16_e32 v0, v0, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b)
- ret bfloat %op
-}
-
-define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
-; GCN-LABEL: v_maxnum_v2bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_max_f32_e32 v0, v0, v2
-; GCN-NEXT: v_max_f32_e32 v1, v1, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_maxnum_v2bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_max_f32_e32 v1, v1, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_maxnum_v2bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_e32 v2, v0, v0
-; GFX8-NEXT: v_max_f16_e32 v3, v1, v1
-; GFX8-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v2, v2, v3
-; GFX8-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_maxnum_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
-; GFX9-NEXT: v_pk_max_f16 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_maxnum_v2bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
-; GFX10-NEXT: v_pk_max_f16 v0, v0, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
- ret <2 x bfloat> %op
-}
-
-define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
-; GCN-LABEL: v_maxnum_v3bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT: v_max_f32_e32 v0, v0, v3
-; GCN-NEXT: v_max_f32_e32 v1, v1, v4
-; GCN-NEXT: v_max_f32_e32 v2, v2, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_maxnum_v3bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_max_f32_e32 v1, v1, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_maxnum_v3bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_e32 v1, v0, v0
-; GFX8-NEXT: v_max_f16_e32 v3, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v3, v1, v3
-; GFX8-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v1, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_maxnum_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, 0xffff
-; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v0
-; GFX9-NEXT: v_bfi_b32 v1, s4, v2, v2
-; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
-; GFX9-NEXT: v_pk_max_f16 v0, v0, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_maxnum_v3bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v0, v0
-; GFX10-NEXT: v_bfi_b32 v1, 0xffff, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
-; GFX10-NEXT: v_pk_max_f16 v0, v0, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
- ret <3 x bfloat> %op
-}
-
-define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
-; GCN-LABEL: v_maxnum_v4bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT: v_max_f32_e32 v0, v0, v4
-; GCN-NEXT: v_max_f32_e32 v1, v1, v5
-; GCN-NEXT: v_max_f32_e32 v2, v2, v6
-; GCN-NEXT: v_max_f32_e32 v3, v3, v7
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_maxnum_v4bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_max_f32_e32 v0, v0, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6
-; GFX7-NEXT: v_max_f32_e32 v1, v1, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_max_f32_e32 v3, v3, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_maxnum_v4bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_e32 v1, v0, v0
-; GFX8-NEXT: v_max_f16_e32 v3, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v3, v1, v3
-; GFX8-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v1, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_maxnum_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT: v_pk_max_f16 v1, v2, v2
-; GFX9-NEXT: v_pk_max_f16 v0, v0, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_maxnum_v4bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX10-NEXT: v_pk_max_f16 v1, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v0, v0, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
- ret <4 x bfloat> %op
-}
-
-define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
-; GCN-LABEL: v_maxnum_v8bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT: v_max_f32_e32 v0, v0, v8
-; GCN-NEXT: v_max_f32_e32 v1, v1, v9
-; GCN-NEXT: v_max_f32_e32 v2, v2, v10
-; GCN-NEXT: v_max_f32_e32 v3, v3, v11
-; GCN-NEXT: v_max_f32_e32 v4, v4, v12
-; GCN-NEXT: v_max_f32_e32 v5, v5, v13
-; GCN-NEXT: v_max_f32_e32 v6, v6, v14
-; GCN-NEXT: v_max_f32_e32 v7, v7, v15
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_maxnum_v8bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_max_f32_e32 v0, v0, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v10
-; GFX7-NEXT: v_max_f32_e32 v1, v1, v9
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v11
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v12
-; GFX7-NEXT: v_max_f32_e32 v3, v3, v9
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v13
-; GFX7-NEXT: v_max_f32_e32 v4, v4, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v14
-; GFX7-NEXT: v_max_f32_e32 v5, v5, v9
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v15
-; GFX7-NEXT: v_max_f32_e32 v6, v6, v8
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_max_f32_e32 v7, v7, v9
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_maxnum_v8bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_e32 v2, v0, v0
-; GFX8-NEXT: v_max_f16_e32 v3, v4, v4
-; GFX8-NEXT: v_max_f16_e32 v6, v2, v3
-; GFX8-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v4, v0, v2
-; GFX8-NEXT: v_max_f16_e32 v0, v1, v1
-; GFX8-NEXT: v_max_f16_e32 v2, v5, v5
-; GFX8-NEXT: v_max_f16_e32 v2, v0, v2
-; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, v6
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_maxnum_v8bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v5
-; GFX9-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT: v_pk_max_f16 v2, v4, v4
-; GFX9-NEXT: v_pk_max_f16 v0, v0, v2
-; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
-; GFX9-NEXT: v_pk_max_f16 v2, v5, v5
-; GFX9-NEXT: v_pk_max_f16 v2, v1, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_maxnum_v8bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5
-; GFX10-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX10-NEXT: v_pk_max_f16 v2, v4, v4
-; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
-; GFX10-NEXT: v_pk_max_f16 v3, v5, v5
-; GFX10-NEXT: v_pk_max_f16 v0, v0, v2
-; GFX10-NEXT: v_pk_max_f16 v2, v1, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
- ret <8 x bfloat> %op
-}
-
-define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
-; GCN-LABEL: v_maxnum_v16bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT: v_max_f32_e32 v0, v0, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v17
-; GCN-NEXT: v_max_f32_e32 v1, v1, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v18
-; GCN-NEXT: v_max_f32_e32 v2, v2, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v19
-; GCN-NEXT: v_max_f32_e32 v3, v3, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v20
-; GCN-NEXT: v_max_f32_e32 v4, v4, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v21
-; GCN-NEXT: v_max_f32_e32 v5, v5, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v22
-; GCN-NEXT: v_max_f32_e32 v6, v6, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v23
-; GCN-NEXT: v_max_f32_e32 v7, v7, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v24
-; GCN-NEXT: v_max_f32_e32 v8, v8, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v25
-; GCN-NEXT: v_max_f32_e32 v9, v9, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v26
-; GCN-NEXT: v_max_f32_e32 v10, v10, v16
-; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32
-; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT: v_cvt_f32_f16_e32 v17, v27
-; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT: v_cvt_f32_f16_e32 v18, v28
-; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT: v_cvt_f32_f16_e32 v19, v29
-; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT: v_cvt_f32_f16_e32 v20, v30
-; GCN-NEXT: v_max_f32_e32 v11, v11, v17
-; GCN-NEXT: v_max_f32_e32 v12, v12, v18
-; GCN-NEXT: v_max_f32_e32 v13, v13, v19
-; GCN-NEXT: v_max_f32_e32 v14, v14, v20
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
-; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
-; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
-; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
-; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
-; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
-; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
-; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT: v_max_f32_e32 v15, v15, v16
-; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_maxnum_v16bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_max_f32_e32 v0, v0, v16
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v20
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_max_f32_e32 v1, v1, v16
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v18
-; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v21
-; GFX7-NEXT: v_max_f32_e32 v4, v4, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v16
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v19
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v22
-; GFX7-NEXT: v_max_f32_e32 v5, v5, v18
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GFX7-NEXT: v_max_f32_e32 v3, v3, v16
-; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32
-; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v23
-; GFX7-NEXT: v_max_f32_e32 v6, v6, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v24
-; GFX7-NEXT: v_max_f32_e32 v7, v7, v18
-; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v25
-; GFX7-NEXT: v_max_f32_e32 v8, v8, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v26
-; GFX7-NEXT: v_max_f32_e32 v9, v9, v18
-; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v27
-; GFX7-NEXT: v_max_f32_e32 v10, v10, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v28
-; GFX7-NEXT: v_max_f32_e32 v11, v11, v18
-; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13
-; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v29
-; GFX7-NEXT: v_max_f32_e32 v12, v12, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v30
-; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15
-; GFX7-NEXT: v_max_f32_e32 v13, v13, v18
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_max_f32_e32 v14, v14, v17
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8
-; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9
-; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10
-; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11
-; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12
-; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13
-; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT: v_max_f32_e32 v15, v15, v16
-; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_maxnum_v16bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_e32 v4, v0, v0
-; GFX8-NEXT: v_max_f16_e32 v5, v8, v8
-; GFX8-NEXT: v_max_f16_e32 v12, v4, v5
-; GFX8-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v4, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v8, v0, v4
-; GFX8-NEXT: v_max_f16_e32 v0, v1, v1
-; GFX8-NEXT: v_max_f16_e32 v4, v9, v9
-; GFX8-NEXT: v_max_f16_e32 v13, v0, v4
-; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v9, v0, v1
-; GFX8-NEXT: v_max_f16_e32 v0, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v1, v10, v10
-; GFX8-NEXT: v_max_f16_e32 v4, v0, v1
-; GFX8-NEXT: v_max_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v5, v0, v1
-; GFX8-NEXT: v_max_f16_e32 v0, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v1, v11, v11
-; GFX8-NEXT: v_max_f16_e32 v6, v0, v1
-; GFX8-NEXT: v_max_f16_sdwa v0, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v7, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, v12
-; GFX8-NEXT: v_mov_b32_e32 v1, v8
-; GFX8-NEXT: v_mov_b32_e32 v2, v13
-; GFX8-NEXT: v_mov_b32_e32 v3, v9
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_maxnum_v16bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v9
-; GFX9-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v10
-; GFX9-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT: v_pk_max_f16 v4, v8, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v11
-; GFX9-NEXT: v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_max_f16 v0, v0, v4
-; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
-; GFX9-NEXT: v_pk_max_f16 v4, v9, v9
-; GFX9-NEXT: v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_max_f16 v8, v1, v4
-; GFX9-NEXT: v_pk_max_f16 v1, v2, v2
-; GFX9-NEXT: v_pk_max_f16 v2, v10, v10
-; GFX9-NEXT: v_pk_max_f16 v4, v1, v2
-; GFX9-NEXT: v_pk_max_f16 v1, v3, v3
-; GFX9-NEXT: v_pk_max_f16 v2, v11, v11
-; GFX9-NEXT: v_pk_max_f16 v6, v1, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX9-NEXT: v_mov_b32_e32 v2, v8
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_maxnum_v16bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v8
-; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v9
-; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v10
-; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v11
-; GFX10-NEXT: v_mov_b32_sdwa v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v10, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX10-NEXT: v_pk_max_f16 v4, v8, v8
-; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
-; GFX10-NEXT: v_pk_max_f16 v5, v9, v9
-; GFX10-NEXT: v_pk_max_f16 v6, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v7, v10, v10
-; GFX10-NEXT: v_pk_max_f16 v3, v3, v3
-; GFX10-NEXT: v_pk_max_f16 v8, v11, v11
-; GFX10-NEXT: v_pk_max_f16 v0, v0, v4
-; GFX10-NEXT: v_pk_max_f16 v2, v1, v5
-; GFX10-NEXT: v_pk_max_f16 v4, v6, v7
-; GFX10-NEXT: v_pk_max_f16 v6, v3, v8
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
- ret <16 x bfloat> %op
-}
-
-define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
-; GCN-LABEL: v_maxnum_v32bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: v_max_f32_e32 v0, v0, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
-; GCN-NEXT: v_max_f32_e32 v1, v1, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_max_f32_e32 v2, v2, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
-; GCN-NEXT: v_max_f32_e32 v3, v3, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_max_f32_e32 v4, v4, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
-; GCN-NEXT: v_max_f32_e32 v5, v5, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_max_f32_e32 v6, v6, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
-; GCN-NEXT: v_max_f32_e32 v7, v7, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_max_f32_e32 v8, v8, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
-; GCN-NEXT: v_max_f32_e32 v9, v9, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_max_f32_e32 v10, v10, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
-; GCN-NEXT: v_max_f32_e32 v11, v11, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_max_f32_e32 v12, v12, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
-; GCN-NEXT: v_max_f32_e32 v13, v13, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_max_f32_e32 v14, v14, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
-; GCN-NEXT: v_max_f32_e32 v15, v15, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_max_f32_e32 v16, v16, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
-; GCN-NEXT: v_max_f32_e32 v17, v17, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_max_f32_e32 v18, v18, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
-; GCN-NEXT: v_max_f32_e32 v19, v19, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_max_f32_e32 v20, v20, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
-; GCN-NEXT: v_max_f32_e32 v21, v21, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_max_f32_e32 v22, v22, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
-; GCN-NEXT: v_max_f32_e32 v23, v23, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_max_f32_e32 v24, v24, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
-; GCN-NEXT: v_max_f32_e32 v25, v25, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_max_f32_e32 v26, v26, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
-; GCN-NEXT: v_max_f32_e32 v27, v27, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GCN-NEXT: v_max_f32_e32 v28, v28, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: v_max_f32_e32 v29, v29, v31
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124
-; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128
-; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: v_max_f32_e32 v30, v30, v31
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v33
-; GCN-NEXT: v_max_f32_e32 v31, v31, v32
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
-; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
-; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
-; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
-; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
-; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
-; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
-; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
-; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16
-; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17
-; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18
-; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19
-; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20
-; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21
-; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22
-; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23
-; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24
-; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25
-; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26
-; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27
-; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28
-; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29
-; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30
-; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_maxnum_v32bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10
-; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12
-; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13
-; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14
-; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15
-; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
-; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18
-; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19
-; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v20
-; GFX7-NEXT: v_cvt_f32_f16_e32 v21, v21
-; GFX7-NEXT: v_cvt_f32_f16_e32 v22, v22
-; GFX7-NEXT: v_cvt_f32_f16_e32 v23, v23
-; GFX7-NEXT: v_cvt_f32_f16_e32 v24, v24
-; GFX7-NEXT: v_cvt_f32_f16_e32 v25, v25
-; GFX7-NEXT: v_cvt_f32_f16_e32 v26, v26
-; GFX7-NEXT: v_cvt_f32_f16_e32 v27, v27
-; GFX7-NEXT: v_cvt_f32_f16_e32 v28, v28
-; GFX7-NEXT: v_cvt_f32_f16_e32 v29, v29
-; GFX7-NEXT: v_cvt_f32_f16_e32 v30, v30
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v0, v0, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v32, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v1, v1, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v3, v3, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v4, v4, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v5, v5, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v6, v6, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v7, v7, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36
-; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v8, v8, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v9, v9, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
-; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v10, v10, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
-; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v11, v11, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52
-; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v12, v12, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
-; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v13, v13, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
-; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v14, v14, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v15, v15, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68
-; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v16, v16, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
-; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v16
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v17, v17, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
-; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v17
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v18, v18, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
-; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v18
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v19, v19, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84
-; GFX7-NEXT: v_cvt_f16_f32_e32 v19, v19
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v20, v20, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
-; GFX7-NEXT: v_cvt_f16_f32_e32 v20, v20
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v21, v21, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92
-; GFX7-NEXT: v_cvt_f16_f32_e32 v21, v21
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v22, v22, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
-; GFX7-NEXT: v_cvt_f16_f32_e32 v22, v22
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v23, v23, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100
-; GFX7-NEXT: v_cvt_f16_f32_e32 v23, v23
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v24, v24, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
-; GFX7-NEXT: v_cvt_f16_f32_e32 v24, v24
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v25, v25, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108
-; GFX7-NEXT: v_cvt_f16_f32_e32 v25, v25
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v26, v26, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
-; GFX7-NEXT: v_cvt_f16_f32_e32 v26, v26
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v27, v27, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
-; GFX7-NEXT: v_cvt_f16_f32_e32 v27, v27
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v28, v28, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
-; GFX7-NEXT: v_cvt_f16_f32_e32 v28, v28
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v29, v29, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124
-; GFX7-NEXT: v_cvt_f16_f32_e32 v29, v29
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v30, v30, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX7-NEXT: v_cvt_f16_f32_e32 v30, v30
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GFX7-NEXT: v_max_f32_e32 v31, v31, v32
-; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_maxnum_v32bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_e32 v8, v0, v0
-; GFX8-NEXT: v_max_f16_e32 v9, v16, v16
-; GFX8-NEXT: v_max_f16_e32 v24, v8, v9
-; GFX8-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v8, v16, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v16, v0, v8
-; GFX8-NEXT: v_max_f16_e32 v0, v1, v1
-; GFX8-NEXT: v_max_f16_e32 v8, v17, v17
-; GFX8-NEXT: v_max_f16_e32 v25, v0, v8
-; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v17, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v17, v0, v1
-; GFX8-NEXT: v_max_f16_e32 v0, v2, v2
-; GFX8-NEXT: v_max_f16_e32 v1, v18, v18
-; GFX8-NEXT: v_max_f16_e32 v26, v0, v1
-; GFX8-NEXT: v_max_f16_sdwa v0, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v18, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v18, v0, v1
-; GFX8-NEXT: v_max_f16_e32 v0, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v1, v19, v19
-; GFX8-NEXT: v_max_f16_e32 v27, v0, v1
-; GFX8-NEXT: v_max_f16_sdwa v0, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v19, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v19, v0, v1
-; GFX8-NEXT: v_max_f16_e32 v0, v4, v4
-; GFX8-NEXT: v_max_f16_e32 v1, v20, v20
-; GFX8-NEXT: v_max_f16_e32 v8, v0, v1
-; GFX8-NEXT: v_max_f16_sdwa v0, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v20, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v9, v0, v1
-; GFX8-NEXT: v_max_f16_e32 v0, v5, v5
-; GFX8-NEXT: v_max_f16_e32 v1, v21, v21
-; GFX8-NEXT: v_max_f16_e32 v10, v0, v1
-; GFX8-NEXT: v_max_f16_sdwa v0, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v21, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v11, v0, v1
-; GFX8-NEXT: v_max_f16_e32 v0, v6, v6
-; GFX8-NEXT: v_max_f16_e32 v1, v22, v22
-; GFX8-NEXT: v_max_f16_e32 v12, v0, v1
-; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v22, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v13, v0, v1
-; GFX8-NEXT: v_max_f16_e32 v0, v7, v7
-; GFX8-NEXT: v_max_f16_e32 v1, v23, v23
-; GFX8-NEXT: v_max_f16_e32 v14, v0, v1
-; GFX8-NEXT: v_max_f16_sdwa v0, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_sdwa v1, v23, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v15, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, v24
-; GFX8-NEXT: v_mov_b32_e32 v1, v16
-; GFX8-NEXT: v_mov_b32_e32 v2, v25
-; GFX8-NEXT: v_mov_b32_e32 v3, v17
-; GFX8-NEXT: v_mov_b32_e32 v4, v26
-; GFX8-NEXT: v_mov_b32_e32 v5, v18
-; GFX8-NEXT: v_mov_b32_e32 v6, v27
-; GFX8-NEXT: v_mov_b32_e32 v7, v19
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_maxnum_v32bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v17
-; GFX9-NEXT: v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v20
-; GFX9-NEXT: v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18
-; GFX9-NEXT: v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v20, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT: v_pk_max_f16 v8, v16, v16
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v19
-; GFX9-NEXT: v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_max_f16 v0, v0, v8
-; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
-; GFX9-NEXT: v_pk_max_f16 v8, v17, v17
-; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v4
-; GFX9-NEXT: v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_max_f16 v16, v1, v8
-; GFX9-NEXT: v_pk_max_f16 v1, v2, v2
-; GFX9-NEXT: v_pk_max_f16 v2, v18, v18
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v5
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v21
-; GFX9-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_max_f16 v18, v1, v2
-; GFX9-NEXT: v_pk_max_f16 v1, v3, v3
-; GFX9-NEXT: v_pk_max_f16 v2, v19, v19
-; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v6
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v22
-; GFX9-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v21, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_max_f16 v17, v1, v2
-; GFX9-NEXT: v_pk_max_f16 v1, v4, v4
-; GFX9-NEXT: v_pk_max_f16 v2, v20, v20
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v23
-; GFX9-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v22, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_max_f16 v8, v1, v2
-; GFX9-NEXT: v_pk_max_f16 v1, v5, v5
-; GFX9-NEXT: v_pk_max_f16 v2, v21, v21
-; GFX9-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v23, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_max_f16 v10, v1, v2
-; GFX9-NEXT: v_pk_max_f16 v1, v6, v6
-; GFX9-NEXT: v_pk_max_f16 v2, v22, v22
-; GFX9-NEXT: v_pk_max_f16 v12, v1, v2
-; GFX9-NEXT: v_pk_max_f16 v1, v7, v7
-; GFX9-NEXT: v_pk_max_f16 v2, v23, v23
-; GFX9-NEXT: v_pk_max_f16 v14, v1, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v16
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v18
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v17
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v12
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v14
-; GFX9-NEXT: v_mov_b32_e32 v2, v16
-; GFX9-NEXT: v_mov_b32_e32 v4, v18
-; GFX9-NEXT: v_mov_b32_e32 v6, v17
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_maxnum_v32bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v17
-; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v18
-; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v5
-; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v6
-; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v7
-; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v19
-; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v21
-; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v22
-; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v23
-; GFX10-NEXT: v_mov_b32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v16, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v17, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v18, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v20, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v21, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX10-NEXT: v_pk_max_f16 v8, v16, v16
-; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
-; GFX10-NEXT: v_pk_max_f16 v9, v17, v17
-; GFX10-NEXT: v_pk_max_f16 v10, v2, v2
-; GFX10-NEXT: v_pk_max_f16 v11, v18, v18
-; GFX10-NEXT: v_mov_b32_sdwa v22, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v23, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_pk_max_f16 v0, v0, v8
-; GFX10-NEXT: v_pk_max_f16 v2, v1, v9
-; GFX10-NEXT: v_pk_max_f16 v16, v10, v11
-; GFX10-NEXT: v_pk_max_f16 v1, v3, v3
-; GFX10-NEXT: v_pk_max_f16 v3, v19, v19
-; GFX10-NEXT: v_pk_max_f16 v4, v4, v4
-; GFX10-NEXT: v_pk_max_f16 v8, v20, v20
-; GFX10-NEXT: v_pk_max_f16 v5, v5, v5
-; GFX10-NEXT: v_pk_max_f16 v9, v21, v21
-; GFX10-NEXT: v_pk_max_f16 v11, v6, v6
-; GFX10-NEXT: v_pk_max_f16 v12, v22, v22
-; GFX10-NEXT: v_pk_max_f16 v7, v7, v7
-; GFX10-NEXT: v_pk_max_f16 v13, v23, v23
-; GFX10-NEXT: v_pk_max_f16 v6, v1, v3
-; GFX10-NEXT: v_pk_max_f16 v8, v4, v8
-; GFX10-NEXT: v_pk_max_f16 v10, v5, v9
-; GFX10-NEXT: v_pk_max_f16 v12, v11, v12
-; GFX10-NEXT: v_pk_max_f16 v14, v7, v13
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v16
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v8
-; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v12
-; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v14
-; GFX10-NEXT: v_mov_b32_e32 v4, v16
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
- ret <32 x bfloat> %op
-}
-
-declare bfloat @llvm.sqrt.bf16(bfloat)
-
-define bfloat @v_sqrt_bf16(bfloat %a) {
-; GCN-LABEL: v_sqrt_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_sqrt_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_sqrt_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_sqrt_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_sqrt_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_sqrt_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_sqrt_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call bfloat @llvm.sqrt.bf16(bfloat %a)
- ret bfloat %op
-}
-
-declare bfloat @llvm.ldexp.bf16.i32(bfloat, i32)
-
-define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
-; GCN-LABEL: v_ldexp_bf16_i32:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_ldexp_bf16_i32:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_ldexp_bf16_i32:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, 0xffff8000
-; GFX8-NEXT: v_mov_b32_e32 v3, 0x7fff
-; GFX8-NEXT: v_med3_i32 v1, v1, v2, v3
-; GFX8-NEXT: v_ldexp_f16_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_ldexp_bf16_i32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff8000
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fff
-; GFX9-NEXT: v_med3_i32 v1, v1, v2, v3
-; GFX9-NEXT: v_ldexp_f16_e32 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_ldexp_bf16_i32:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, 0x7fff
-; GFX10-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2
-; GFX10-NEXT: v_ldexp_f16_e32 v0, v0, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call bfloat @llvm.ldexp.bf16.i32(bfloat %a, i32 %b)
- ret bfloat %op
-}
-
-declare { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat)
-
-define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
-; GCN-LABEL: v_frexp_bf16_i16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 0x7f800000
-; GCN-NEXT: v_frexp_mant_f32_e32 v2, v0
-; GCN-NEXT: v_frexp_exp_i32_f32_e32 v3, v0
-; GCN-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_frexp_bf16_i16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_frexp_bf16_i16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_frexp_mant_f16_e32 v2, v0
-; GFX8-NEXT: v_frexp_exp_i16_f16_e32 v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, v2
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_frexp_bf16_i16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_frexp_mant_f16_e32 v2, v0
-; GFX9-NEXT: v_frexp_exp_i16_f16_e32 v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_frexp_bf16_i16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_frexp_mant_f16_e32 v2, v0
-; GFX10-NEXT: v_frexp_exp_i16_f16_e32 v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, v2
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat %a)
- ret { bfloat, i16 } %op
-}
-
-
-declare bfloat @llvm.log.bf16(bfloat)
-declare bfloat @llvm.log2.bf16(bfloat)
-declare bfloat @llvm.log10.bf16(bfloat)
-
-define bfloat @v_log_bf16(bfloat %a) {
-; GCN-LABEL: v_log_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_log_f32_e32 v0, v0
-; GCN-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_log_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_log_f32_e32 v0, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_log_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_log_f16_e32 v0, v0
-; GFX8-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_log_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_log_f16_e32 v0, v0
-; GFX9-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_log_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_log_f16_e32 v0, v0
-; GFX10-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call bfloat @llvm.log.bf16(bfloat %a)
- ret bfloat %op
-}
-
-define bfloat @v_log2_bf16(bfloat %a) {
-; GCN-LABEL: v_log2_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_log_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_log2_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_log_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_log2_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_log_f16_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_log2_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_log_f16_e32 v0, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_log2_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_log_f16_e32 v0, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call bfloat @llvm.log2.bf16(bfloat %a)
- ret bfloat %op
-}
-
-define bfloat @v_log10_bf16(bfloat %a) {
-; GCN-LABEL: v_log10_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_log_f32_e32 v0, v0
-; GCN-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_log10_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_log_f32_e32 v0, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_log10_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_log_f16_e32 v0, v0
-; GFX8-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_log10_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_log_f16_e32 v0, v0
-; GFX9-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_log10_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_log_f16_e32 v0, v0
-; GFX10-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call bfloat @llvm.log10.bf16(bfloat %a)
- ret bfloat %op
-}
-
-declare bfloat @llvm.exp.bf16(bfloat)
-declare bfloat @llvm.exp2.bf16(bfloat)
-declare bfloat @llvm.exp10.bf16(bfloat)
-
-define bfloat @v_exp_bf16(bfloat %a) {
-; GCN-LABEL: v_exp_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GCN-NEXT: v_exp_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_exp_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX7-NEXT: v_exp_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_exp_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX8-NEXT: v_exp_f32_e32 v0, v0
-; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_exp_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_exp_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call bfloat @llvm.exp.bf16(bfloat %a)
- ret bfloat %op
-}
-
-define bfloat @v_exp2_bf16(bfloat %a) {
-; GCN-LABEL: v_exp2_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_exp_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_exp2_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_exp_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_exp2_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_exp_f16_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_exp2_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_exp_f16_e32 v0, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_exp2_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_exp_f16_e32 v0, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call bfloat @llvm.exp2.bf16(bfloat %a)
- ret bfloat %op
-}
-
-define bfloat @v_exp10_bf16(bfloat %a) {
-; GCN-LABEL: v_exp10_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GCN-NEXT: v_exp_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_exp10_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX7-NEXT: v_exp_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_exp10_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX8-NEXT: v_exp_f32_e32 v0, v0
-; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_exp10_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_exp10_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX10-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call bfloat @llvm.exp10.bf16(bfloat %a)
- ret bfloat %op
-}
-
-declare bfloat @llvm.ceil.bf16(bfloat)
-
-define bfloat @v_ceil_bf16(bfloat %a) {
-; GCN-LABEL: v_ceil_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_ceil_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_ceil_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_ceil_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_ceil_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_ceil_f16_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_ceil_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_ceil_f16_e32 v0, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_ceil_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_ceil_f16_e32 v0, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call bfloat @llvm.ceil.bf16(bfloat %a)
- ret bfloat %op
-}
-
-declare bfloat @llvm.trunc.bf16(bfloat)
-
-define bfloat @v_trunc_bf16(bfloat %a) {
-; GCN-LABEL: v_trunc_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_trunc_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_trunc_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_trunc_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_trunc_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_trunc_f16_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_trunc_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_trunc_f16_e32 v0, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_trunc_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_trunc_f16_e32 v0, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call bfloat @llvm.trunc.bf16(bfloat %a)
- ret bfloat %op
-}
-
-declare bfloat @llvm.rint.bf16(bfloat)
-
-define bfloat @v_rint_bf16(bfloat %a) {
-; GCN-LABEL: v_rint_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_rndne_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_rint_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_rndne_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_rint_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_rndne_f16_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_rint_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rndne_f16_e32 v0, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_rint_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rndne_f16_e32 v0, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call bfloat @llvm.rint.bf16(bfloat %a)
- ret bfloat %op
-}
-
-declare bfloat @llvm.nearbyint.bf16(bfloat)
-
-; FIXME: unable to legalize instruction: %2:_(s16) = G_FNEARBYINT %0:_
-; define bfloat @v_nearbyint_bf16(bfloat %a) {
-; %op = call bfloat @llvm.nearbyint.bf16(bfloat %a)
-; ret bfloat %op
-; }
-
-declare bfloat @llvm.round.bf16(bfloat)
-
-define bfloat @v_round_bf16(bfloat %a) {
-; GCN-LABEL: v_round_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, 0.5
-; GCN-NEXT: v_mov_b32_e32 v3, 0x3c00
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff8000, v0
-; GCN-NEXT: v_trunc_f32_e32 v4, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT: v_cvt_f32_f16_e64 v5, -v4
-; GCN-NEXT: v_add_f32_e32 v1, v1, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e64 v1, |v1|
-; GCN-NEXT: v_cmp_ge_f32_e32 vcc, v1, v2
-; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
-; GCN-NEXT: v_or_b32_e32 v0, v1, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_add_f32_e32 v0, v1, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_round_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX7-NEXT: v_mov_b32_e32 v4, 0x3c00
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff8000, v0
-; GFX7-NEXT: v_trunc_f32_e32 v2, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e64 v3, -v2
-; GFX7-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, 0.5
-; GFX7-NEXT: v_cvt_f32_f16_e64 v1, |v1|
-; GFX7-NEXT: v_cmp_ge_f32_e32 vcc, v1, v3
-; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_round_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_trunc_f16_e32 v1, v0
-; GFX8-NEXT: v_sub_f16_e32 v2, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, 0x3c00
-; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff8000, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: v_add_f16_e32 v0, v1, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_round_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_trunc_f16_e32 v1, v0
-; GFX9-NEXT: v_sub_f16_e32 v2, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x3c00
-; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff8000, v0
-; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX9-NEXT: v_add_f16_e32 v0, v1, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_round_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_trunc_f16_e32 v1, v0
-; GFX10-NEXT: v_sub_f16_e32 v2, v0, v1
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff8000, v0
-; GFX10-NEXT: v_cmp_ge_f16_e64 s4, |v2|, 0.5
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s4
-; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX10-NEXT: v_add_f16_e32 v0, v1, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call bfloat @llvm.round.bf16(bfloat %a)
- ret bfloat %op
-}
-
-declare bfloat @llvm.roundeven.bf16(bfloat)
-
-define bfloat @v_roundeven_bf16(bfloat %a) {
-; GCN-LABEL: v_roundeven_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_rndne_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_roundeven_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_rndne_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_roundeven_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_rndne_f16_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_roundeven_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rndne_f16_e32 v0, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_roundeven_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rndne_f16_e32 v0, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call bfloat @llvm.roundeven.bf16(bfloat %a)
- ret bfloat %op
-}
-
-declare bfloat @llvm.floor.bf16(bfloat)
-
-define bfloat @v_floor_bf16(bfloat %a) {
-; GCN-LABEL: v_floor_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_floor_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_floor_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_floor_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_floor_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_floor_f16_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_floor_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_floor_f16_e32 v0, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_floor_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_floor_f16_e32 v0, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call bfloat @llvm.floor.bf16(bfloat %a)
- ret bfloat %op
-}
-
-declare bfloat @llvm.canonicalize.bf16(bfloat)
-
-define bfloat @v_canonicalize_bf16(bfloat %a) {
-; GCN-LABEL: v_canonicalize_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_canonicalize_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_canonicalize_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_canonicalize_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_canonicalize_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call bfloat @llvm.canonicalize.bf16(bfloat %a)
- ret bfloat %op
-}
-
-declare bfloat @llvm.arithmetic.fence.bf16(bfloat)
-
-; FIXME: Promotion broken
-; define bfloat @v_arithmetic_fence_bf16(bfloat %a) {
-; %op = call bfloat @llvm.arithmetic.fence.bf16(bfloat %a)
-; ret bfloat %op
-; }
-
-define i1 @v_fcmp_false_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_false_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_false_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, 0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_false_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, 0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_false_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_false_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fcmp false bfloat %a, %b
- ret i1 %op
-}
-
-define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_oeq_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_oeq_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_oeq_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_oeq_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_eq_f16_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_oeq_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cmp_eq_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fcmp oeq bfloat %a, %b
- ret i1 %op
-}
-
-define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_ogt_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_ogt_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_ogt_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_ogt_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_ogt_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cmp_gt_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fcmp ogt bfloat %a, %b
- ret i1 %op
-}
-
-define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_oge_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_oge_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_oge_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cmp_ge_f16_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_oge_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_oge_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fcmp oge bfloat %a, %b
- ret i1 %op
-}
-
-define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_olt_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_olt_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_olt_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_olt_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_olt_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fcmp olt bfloat %a, %b
- ret i1 %op
-}
-
-define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_ole_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_ole_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_ole_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cmp_le_f16_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_ole_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_le_f16_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_ole_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cmp_le_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fcmp ole bfloat %a, %b
- ret i1 %op
-}
-
-define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_one_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_one_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_one_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cmp_lg_f16_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_one_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_lg_f16_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_one_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cmp_lg_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fcmp one bfloat %a, %b
- ret i1 %op
-}
-
-define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_uno_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_uno_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_uno_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cmp_u_f16_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_uno_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_u_f16_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_uno_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fcmp uno bfloat %a, %b
- ret i1 %op
-}
-
-define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_ueq_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_ueq_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_ueq_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cmp_nlg_f16_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_ueq_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_nlg_f16_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_ueq_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fcmp ueq bfloat %a, %b
- ret i1 %op
-}
-
-define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_ugt_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_ugt_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_ugt_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_ugt_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_ugt_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fcmp ugt bfloat %a, %b
- ret i1 %op
-}
-
-define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_uge_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_uge_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_uge_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_uge_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_uge_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fcmp uge bfloat %a, %b
- ret i1 %op
-}
-
-define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_ult_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_ult_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_ult_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cmp_nge_f16_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_ult_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_nge_f16_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_ult_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cmp_nge_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fcmp ult bfloat %a, %b
- ret i1 %op
-}
-
-define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_ule_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_ule_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_ule_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_ule_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_ule_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fcmp ule bfloat %a, %b
- ret i1 %op
-}
-
-define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_une_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_une_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_une_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cmp_neq_f16_e32 vcc, v0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_une_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_neq_f16_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_une_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cmp_neq_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fcmp une bfloat %a, %b
- ret i1 %op
-}
-
-define i1 @v_fcmp_true_bf16(bfloat %a, bfloat %b) {
-; GCN-LABEL: v_fcmp_true_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, 1
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fcmp_true_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, 1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fcmp_true_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, 1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fcmp_true_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fcmp_true_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, 1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = fcmp true bfloat %a, %b
- ret i1 %op
-}
-
-declare bfloat @llvm.copysign.bf16(bfloat, bfloat)
-
-define bfloat @v_copysign_bf16_bf16(bfloat %mag, bfloat %sign) {
-; GCN-LABEL: v_copysign_bf16_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GCN-NEXT: v_or_b32_e32 v0, v0, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_copysign_bf16_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_copysign_bf16_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_copysign_bf16_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_copysign_bf16_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
- ret bfloat %op
-}
-
-; FIXME: unable to lower arguments: ptr
-; define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) {
-; %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
-; ret bfloat %op
-; }
-
-; FIXME: unable to lower arguments: ptr
-; define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) {
-; %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
-; ret bfloat %op
-; }
-
-; FIXME: unable to translate instruction: fptrunc
-; define bfloat @v_copysign_bf16_f32(bfloat %mag, float %sign.f32) {
-; %sign = fptrunc float %sign.f32 to bfloat
-; %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
-; ret bfloat %op
-; }
-
-; FIXME: unable to translate instruction: fptrunc
-; define bfloat @v_copysign_bf16_f64(bfloat %mag, double %sign.f64) {
-; %sign = fptrunc double %sign.f64 to bfloat
-; %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
-; ret bfloat %op
-; }
-
-define bfloat @v_copysign_bf16_f16(bfloat %mag, half %sign.f16) {
-; GCN-LABEL: v_copysign_bf16_f16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GCN-NEXT: v_or_b32_e32 v0, v0, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_copysign_bf16_f16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_copysign_bf16_f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_copysign_bf16_f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_copysign_bf16_f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %sign = bitcast half %sign.f16 to bfloat
- %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
- ret bfloat %op
-}
-
-define amdgpu_ps i32 @s_copysign_bf16_bf16(bfloat inreg %mag, bfloat inreg %sign) {
-; GCN-LABEL: s_copysign_bf16_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_and_b32 s0, s0, 0x7fff
-; GCN-NEXT: s_and_b32 s1, s1, 0xffff8000
-; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_and_b32 s0, 0xffff, s0
-; GCN-NEXT: ; return to shader part epilog
-;
-; GFX7-LABEL: s_copysign_bf16_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_and_b32 s0, s0, 0x7fff
-; GFX7-NEXT: s_and_b32 s1, s1, 0xffff8000
-; GFX7-NEXT: s_or_b32 s0, s0, s1
-; GFX7-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX7-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_copysign_bf16_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b32 s0, s0, 0x7fff
-; GFX8-NEXT: s_and_b32 s1, s1, 0xffff8000
-; GFX8-NEXT: s_or_b32 s0, s0, s1
-; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX9-LABEL: s_copysign_bf16_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s0, s0, 0x7fff
-; GFX9-NEXT: s_and_b32 s1, s1, 0xffff8000
-; GFX9-NEXT: s_or_b32 s0, s0, s1
-; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_copysign_bf16_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b32 s0, s0, 0x7fff
-; GFX10-NEXT: s_and_b32 s1, s1, 0xffff8000
-; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX10-NEXT: ; return to shader part epilog
- %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
- %cast = bitcast bfloat %op to i16
- %zext = zext i16 %cast to i32
- %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
- ret i32 %readlane
-}
-
-; FIXME: unable to translate instruction: fptrunc
-; define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f32) {
-; %sign = fptrunc float %sign.f32 to bfloat
-; %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
-; %cast = bitcast bfloat %op to i16
-; %zext = zext i16 %cast to i32
-; %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
-; ret i32 %readlane
-; }
-
-; FIXME: unable to translate instruction: fptrunc
-; define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign.f64) {
-; %sign = fptrunc double %sign.f64 to bfloat
-; %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
-; %cast = bitcast bfloat %op to i16
-; %zext = zext i16 %cast to i32
-; %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
-; ret i32 %readlane
-; }
-
-define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f16) {
-; GCN-LABEL: s_copysign_bf16_f16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_and_b32 s0, s0, 0x7fff
-; GCN-NEXT: s_and_b32 s1, s1, 0xffff8000
-; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_and_b32 s0, 0xffff, s0
-; GCN-NEXT: ; return to shader part epilog
-;
-; GFX7-LABEL: s_copysign_bf16_f16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_and_b32 s0, s0, 0x7fff
-; GFX7-NEXT: s_and_b32 s1, s1, 0xffff8000
-; GFX7-NEXT: s_or_b32 s0, s0, s1
-; GFX7-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX7-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_copysign_bf16_f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b32 s0, s0, 0x7fff
-; GFX8-NEXT: s_and_b32 s1, s1, 0xffff8000
-; GFX8-NEXT: s_or_b32 s0, s0, s1
-; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX9-LABEL: s_copysign_bf16_f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s0, s0, 0x7fff
-; GFX9-NEXT: s_and_b32 s1, s1, 0xffff8000
-; GFX9-NEXT: s_or_b32 s0, s0, s1
-; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_copysign_bf16_f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b32 s0, s0, 0x7fff
-; GFX10-NEXT: s_and_b32 s1, s1, 0xffff8000
-; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX10-NEXT: ; return to shader part epilog
- %sign = bitcast half %sign.f16 to bfloat
- %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
- %cast = bitcast bfloat %op to i16
- %zext = zext i16 %cast to i32
- %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
- ret i32 %readlane
-}
-
-declare float @llvm.copysign.f32(float, float)
-
-; FIXME: unable to translate instruction: fpext
-; define float @v_copysign_f32_bf16(float %mag, bfloat %sign.bf16) {
-; %sign = fpext bfloat %sign.bf16 to float
-; %op = call float @llvm.copysign.f32(float %mag, float %sign)
-; ret float %op
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define amdgpu_ps i32 @s_copysign_f32_bf16(float inreg %mag, bfloat inreg %sign.bf16) {
-; %sign = fpext bfloat %sign.bf16 to float
-; %op = call float @llvm.copysign.f32(float %mag, float %sign)
-; %cast = bitcast float %op to i32
-; %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
-; ret i32 %readlane
-; }
-
-declare half @llvm.copysign.f16(half, half)
-
-define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) {
-; GCN-LABEL: v_copysign_f16_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GCN-NEXT: v_or_b32_e32 v0, v0, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_copysign_f16_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_copysign_f16_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_copysign_f16_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_copysign_f16_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %sign = bitcast bfloat %sign.bf16 to half
- %op = call half @llvm.copysign.f16(half %mag, half %sign)
- ret half %op
-}
-
-define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf16) {
-; GCN-LABEL: s_copysign_f16_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_and_b32 s0, s0, 0x7fff
-; GCN-NEXT: s_and_b32 s1, s1, 0xffff8000
-; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_and_b32 s0, 0xffff, s0
-; GCN-NEXT: ; return to shader part epilog
-;
-; GFX7-LABEL: s_copysign_f16_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_and_b32 s0, s0, 0x7fff
-; GFX7-NEXT: s_and_b32 s1, s1, 0xffff8000
-; GFX7-NEXT: s_or_b32 s0, s0, s1
-; GFX7-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX7-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_copysign_f16_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b32 s0, s0, 0x7fff
-; GFX8-NEXT: s_and_b32 s1, s1, 0xffff8000
-; GFX8-NEXT: s_or_b32 s0, s0, s1
-; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX9-LABEL: s_copysign_f16_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s0, s0, 0x7fff
-; GFX9-NEXT: s_and_b32 s1, s1, 0xffff8000
-; GFX9-NEXT: s_or_b32 s0, s0, s1
-; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_copysign_f16_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b32 s0, s0, 0x7fff
-; GFX10-NEXT: s_and_b32 s1, s1, 0xffff8000
-; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX10-NEXT: ; return to shader part epilog
- %sign = bitcast bfloat %sign.bf16 to half
- %op = call half @llvm.copysign.f16(half %mag, half %sign)
- %cast = bitcast half %op to i16
- %zext = zext i16 %cast to i32
- %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
- ret i32 %readlane
-}
-
-declare double @llvm.copysign.f64(double, double)
-
-; FIXME: unable to translate instruction: fpext
-; define double @v_copysign_f64_bf16(double %mag, bfloat %sign.bf16) {
-; %sign = fpext bfloat %sign.bf16 to double
-; %op = call double @llvm.copysign.f64(double %mag, double %sign)
-; ret double %op
-; }
-
-; FIXME: unable to translate instruction: fpext
-; define amdgpu_ps <2 x i32> @s_copysign_f64_bf16(double inreg %mag, bfloat inreg %sign.bf16) {
-; %sign = fpext bfloat %sign.bf16 to double
-; %op = call double @llvm.copysign.f64(double %mag, double %sign)
-; %cast = bitcast double %op to <2 x i32>
-; %cast.0 = extractelement <2 x i32> %cast, i32 0
-; %cast.1 = extractelement <2 x i32> %cast, i32 1
-; %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.0)
-; %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.1)
-; %ins.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
-; %ins.1 = insertelement <2 x i32> %ins.0, i32 %readlane1, i32 1
-; ret <2 x i32> %ins.1
-; }
-
-; FIXME: unable to translate instruction: fptosi
-; define i16 @v_fptosi_bf16_to_i16(bfloat %x) {
-; %op = fptosi bfloat %x to i16
-; ret i16 %op
-; }
-
-; FIXME: unable to translate instruction: fptosi
-; define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) {
-; %op = fptosi <2 x bfloat> %x to <2 x i16>
-; ret <2 x i16> %op
-; }
-
-; FIXME: unable to translate instruction: fptosi
-; define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) {
-; %op = fptosi <3 x bfloat> %x to <3 x i16>
-; ret <3 x i16> %op
-; }
-
-; FIXME: unable to translate instruction: fptosi
-; define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
-; %op = fptosi <4 x bfloat> %x to <4 x i16>
-; ret <4 x i16> %op
-; }
-
-; FIXME: unable to translate instruction: fptosi
-; define i32 @v_fptosi_bf16_to_i32(bfloat %x) {
-; %op = fptosi bfloat %x to i32
-; ret i32 %op
-; }
-
-; FIXME: unable to translate instruction: fptosi
-; define <2 x i32> @v_fptosi_v2bf16_to_v2i32(<2 x bfloat> %x) {
-; %op = fptosi <2 x bfloat> %x to <2 x i32>
-; ret <2 x i32> %op
-; }
-
-; FIXME: unable to translate instruction: fptosi
-; define <3 x i32> @v_fptosi_v3bf16_to_v3i32(<3 x bfloat> %x) {
-; %op = fptosi <3 x bfloat> %x to <3 x i32>
-; ret <3 x i32> %op
-; }
-
-; FIXME: unable to translate instruction: fptosi
-; define <4 x i32> @v_fptosi_v4bf16_to_v4i32(<4 x bfloat> %x) {
-; %op = fptosi <4 x bfloat> %x to <4 x i32>
-; ret <4 x i32> %op
-; }
-
-; FIXME: unable to translate instruction: fptosi
-; define i64 @v_fptosi_bf16_to_i64(bfloat %x) {
-; %op = fptosi bfloat %x to i64
-; ret i64 %op
-; }
-
-; FIXME: unable to translate instruction: fptosi
-; define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) {
-; %op = fptosi <2 x bfloat> %x to <2 x i64>
-; ret <2 x i64> %op
-; }
-
-; FIXME: unable to translate instruction: fptosi
-; define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) {
-; %op = fptosi <3 x bfloat> %x to <3 x i64>
-; ret <3 x i64> %op
-; }
-
-; FIXME: unable to translate instruction: fptosi
-; define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) {
-; %op = fptosi <4 x bfloat> %x to <4 x i64>
-; ret <4 x i64> %op
-; }
-
-; FIXME: unable to translate instruction: sitofp
-; define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
-; %op = sitofp i16 %x to bfloat
-; ret bfloat %op
-; }
-
-; FIXME: unable to translate instruction: sitofp
-; define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
-; %op = sitofp <2 x i16> %x to <2 x bfloat>
-; ret <2 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: sitofp
-; define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
-; %op = sitofp <3 x i16> %x to <3 x bfloat>
-; ret <3 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: sitofp
-; define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
-; %op = sitofp <4 x i16> %x to <4 x bfloat>
-; ret <4 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: sitofp
-; define bfloat @v_sitofp_i32_to_bf16(i32 %x) {
-; %op = sitofp i32 %x to bfloat
-; ret bfloat %op
-; }
-
-; FIXME: unable to translate instruction: sitofp
-; define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
-; %op = sitofp <2 x i32> %x to <2 x bfloat>
-; ret <2 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: sitofp
-; define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
-; %op = sitofp <3 x i32> %x to <3 x bfloat>
-; ret <3 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: sitofp
-; define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
-; %op = sitofp <4 x i32> %x to <4 x bfloat>
-; ret <4 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: sitofp
-; define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
-; %op = sitofp i64 %x to bfloat
-; ret bfloat %op
-; }
-
-; FIXME: unable to translate instruction: sitofp
-; define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
-; %op = sitofp <2 x i64> %x to <2 x bfloat>
-; ret <2 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: sitofp
-; define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
-; %op = sitofp <3 x i64> %x to <3 x bfloat>
-; ret <3 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: sitofp
-; define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
-; %op = sitofp <4 x i64> %x to <4 x bfloat>
-; ret <4 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: uitofp
-; define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
-; %op = uitofp i16 %x to bfloat
-; ret bfloat %op
-; }
-
-; FIXME: unable to translate instruction: uitofp
-; define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
-; %op = uitofp <2 x i16> %x to <2 x bfloat>
-; ret <2 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: uitofp
-; define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
-; %op = uitofp <3 x i16> %x to <3 x bfloat>
-; ret <3 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: uitofp
-; define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
-; %op = uitofp <4 x i16> %x to <4 x bfloat>
-; ret <4 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: uitofp
-; define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
-; %op = uitofp i32 %x to bfloat
-; ret bfloat %op
-; }
-
-; FIXME: unable to translate instruction: uitofp
-; define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
-; %op = uitofp <2 x i32> %x to <2 x bfloat>
-; ret <2 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: uitofp
-; define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
-; %op = uitofp <3 x i32> %x to <3 x bfloat>
-; ret <3 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: uitofp
-; define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
-; %op = uitofp <4 x i32> %x to <4 x bfloat>
-; ret <4 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: uitofp
-; define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
-; %op = uitofp i64 %x to bfloat
-; ret bfloat %op
-; }
-
-; FIXME: unable to translate instruction: uitofp
-; define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
-; %op = uitofp <2 x i64> %x to <2 x bfloat>
-; ret <2 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: uitofp
-; define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
-; %op = uitofp <3 x i64> %x to <3 x bfloat>
-; ret <3 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: uitofp
-; define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
-; %op = uitofp <4 x i64> %x to <4 x bfloat>
-; ret <4 x bfloat> %op
-; }
-
-define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
-; GCN-LABEL: v_select_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_select_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_select_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_select_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_select_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = select i1 %cond, bfloat %a, bfloat %b
- ret bfloat %op
-}
-
-define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
-; GCN-LABEL: v_select_fneg_lhs_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_select_fneg_lhs_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_select_fneg_lhs_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_select_fneg_lhs_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_select_fneg_lhs_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %neg.a = fneg bfloat %a
- %op = select i1 %cond, bfloat %neg.a, bfloat %b
- ret bfloat %op
-}
-
-define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
-; GCN-LABEL: v_select_fneg_rhs_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_xor_b32_e32 v2, 0x8000, v2
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_select_fneg_rhs_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_xor_b32_e32 v2, 0x8000, v2
-; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_select_fneg_rhs_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_xor_b32_e32 v2, 0x8000, v2
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_select_fneg_rhs_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_xor_b32_e32 v2, 0x8000, v2
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_select_fneg_rhs_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_xor_b32_e32 v2, 0x8000, v2
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %neg.b = fneg bfloat %b
- %op = select i1 %cond, bfloat %a, bfloat %neg.b
- ret bfloat %op
-}
-
-define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) {
-; GCN-LABEL: v_select_v2bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_or_b32_e32 v1, v2, v1
-; GCN-NEXT: v_or_b32_e32 v2, v4, v3
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_select_v2bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_select_v2bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_select_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_select_v2bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b
- ret <2 x bfloat> %op
-}
-
-define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b) {
-; GCN-LABEL: v_vselect_v2bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_vselect_v2bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_vselect_v2bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_vselect_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_vselect_v2bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v3
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
-; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b
- ret <2 x bfloat> %op
-}
-
-define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) {
-; GCN-LABEL: s_select_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_mov_b32_e32 v1, s0
-; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: ; return to shader part epilog
-;
-; GFX7-LABEL: s_select_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: v_mov_b32_e32 v1, s0
-; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_readfirstlane_b32 s0, v0
-; GFX7-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_select_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX9-LABEL: s_select_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_select_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
- %cond = icmp eq i32 %c, 0
- %op = select i1 %cond, bfloat %a, bfloat %b
- %cast = bitcast bfloat %op to i16
- %zext = zext i16 %cast to i32
- %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
- ret i32 %readlane
-}
-
-; FIXME: unable to translate instruction: bitcast
-; define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, i32 %c) {
-; %cond = icmp eq i32 %c, 0
-; %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b
-; %cast = bitcast <2 x bfloat> %op to i32
-; %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
-; ret i32 %readlane
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x i32> %c) {
-; %cond = icmp eq <2 x i32> %c, zeroinitializer
-; %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b
-; %cast = bitcast <2 x bfloat> %op to i32
-; %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
-; ret i32 %readlane
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b) {
-; %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b
-; ret <3 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) {
-; %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b
-; ret <4 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b) {
-; %op = select i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b
-; ret <6 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) {
-; %op = select i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b
-; ret <8 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b) {
-; %op = select i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b
-; ret <16 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b) {
-; %op = select i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b
-; ret <32 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> inreg %b, i32 %c) {
-; %cond = icmp eq i32 %c, 0
-; %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b
-; %cast = bitcast <3 x bfloat> %op to i48
-; %elt0 = trunc i48 %cast to i32
-; %elt1.hi = lshr i48 %cast, 32
-; %elt1 = trunc i48 %elt1.hi to i32
-; %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
-; %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
-; %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
-; %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
-; ret <2 x i32> %bv.1
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, i32 %c) {
-; %cond = icmp eq i32 %c, 0
-; %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b
-; %cast = bitcast <4 x bfloat> %op to <2 x i32>
-; %elt0 = extractelement <2 x i32> %cast, i32 0
-; %elt1 = extractelement <2 x i32> %cast, i32 1
-; %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
-; %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
-; %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
-; %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
-; ret <2 x i32> %bv.1
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, <4 x i32> %c) {
-; %cond = icmp eq <4 x i32> %c, zeroinitializer
-; %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b
-; %cast = bitcast <4 x bfloat> %op to <2 x i32>
-; %elt0 = extractelement <2 x i32> %cast, i32 0
-; %elt1 = extractelement <2 x i32> %cast, i32 1
-; %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
-; %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
-; %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
-; %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
-; ret <2 x i32> %bv.1
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b) {
-; %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b
-; ret <4 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b) {
-; %op = select <8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b
-; ret <8 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b) {
-; %op = select <16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b
-; ret <16 x bfloat> %op
-; }
-
-; FIXME: unable to translate instruction: bitcast
-; define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b) {
-; %op = select <32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b
-; ret <32 x bfloat> %op
-; }
-
-declare bfloat @llvm.fma.bf16(bfloat, bfloat, bfloat)
-declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
-declare <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)
-declare <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
-
-define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
-; GCN-LABEL: v_fma_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_fma_f32 v0, v0, v1, v2
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fma_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_fma_f32 v0, v0, v1, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fma_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fma_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_fma_f16 v0, v0, v1, v2
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fma_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
- ret bfloat %op
-}
-
-define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
-; GCN-LABEL: v_fma_v2bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT: v_fma_f32 v0, v0, v2, v4
-; GCN-NEXT: v_fma_f32 v1, v1, v3, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fma_v2bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_fma_f32 v0, v0, v2, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_fma_f32 v1, v1, v3, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fma_v2bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2
-; GFX8-NEXT: v_fma_f16 v1, v3, v4, v5
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fma_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fma_v2bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_fma_f16 v0, v0, v1, v2
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
- ret <2 x bfloat> %op
-}
-
-define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) {
-; GCN-LABEL: v_fma_v3bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT: v_fma_f32 v0, v0, v3, v6
-; GCN-NEXT: v_fma_f32 v1, v1, v4, v7
-; GCN-NEXT: v_fma_f32 v2, v2, v5, v8
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fma_v3bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_fma_f32 v0, v0, v3, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v8
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_fma_f32 v1, v1, v3, v4
-; GFX7-NEXT: v_fma_f32 v2, v2, v5, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fma_v3bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_fma_f16 v0, v0, v2, v4
-; GFX8-NEXT: v_fma_f16 v1, v1, v3, v5
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fma_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, 0xffff
-; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v0
-; GFX9-NEXT: v_bfi_b32 v1, s4, v2, v2
-; GFX9-NEXT: v_bfi_b32 v2, s4, v4, v4
-; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fma_v3bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v0, v0
-; GFX10-NEXT: v_bfi_b32 v1, 0xffff, v2, v2
-; GFX10-NEXT: v_bfi_b32 v2, 0xffff, v4, v4
-; GFX10-NEXT: v_pk_fma_f16 v0, v0, v1, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
- ret <3 x bfloat> %op
-}
-
-define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
-; GCN-LABEL: v_fma_v4bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT: v_fma_f32 v0, v0, v4, v8
-; GCN-NEXT: v_fma_f32 v1, v1, v5, v9
-; GCN-NEXT: v_fma_f32 v2, v2, v6, v10
-; GCN-NEXT: v_fma_f32 v3, v3, v7, v11
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fma_v4bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GFX7-NEXT: v_fma_f32 v0, v0, v4, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6
-; GFX7-NEXT: v_fma_f32 v1, v1, v5, v9
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v10
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v11
-; GFX7-NEXT: v_fma_f32 v2, v2, v4, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_fma_f32 v3, v3, v6, v7
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fma_v4bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_fma_f16 v0, v0, v2, v4
-; GFX8-NEXT: v_fma_f16 v1, v1, v3, v5
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fma_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; GFX9-NEXT: v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_fma_f16 v0, v0, v2, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fma_v4bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; GFX10-NEXT: v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_pk_fma_f16 v0, v0, v2, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
- ret <4 x bfloat> %op
-}
-
-declare bfloat @llvm.fmuladd.bf16(bfloat, bfloat, bfloat)
-declare <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
-declare <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)
-declare <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
-
-define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
-; GCN-LABEL: v_fmuladd_bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v2
-; GCN-NEXT: v_add_f32_e32 v0, v0, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fmuladd_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fmuladd_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX8-NEXT: v_add_f16_e32 v0, v0, v2
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fmuladd_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX9-NEXT: v_add_f16_e32 v0, v0, v2
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fmuladd_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX10-NEXT: v_add_f16_e32 v0, v0, v2
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call bfloat @llvm.fmuladd.bf16(bfloat %a, bfloat %b, bfloat %c)
- ret bfloat %op
-}
-
-define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
-; GCN-LABEL: v_fmuladd_v2bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT: v_mul_f32_e32 v0, v0, v2
-; GCN-NEXT: v_mul_f32_e32 v1, v1, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_add_f32_e32 v0, v0, v4
-; GCN-NEXT: v_add_f32_e32 v1, v1, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fmuladd_v2bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fmuladd_v2bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mul_f16_e32 v3, v0, v1
-; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v1, v3, v2
-; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fmuladd_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1
-; GFX9-NEXT: v_pk_add_f16 v0, v0, v2
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fmuladd_v2bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1
-; GFX10-NEXT: v_pk_add_f16 v0, v0, v2
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
- ret <2 x bfloat> %op
-}
-
-define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) {
-; GCN-LABEL: v_fmuladd_v3bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT: v_mul_f32_e32 v0, v0, v3
-; GCN-NEXT: v_mul_f32_e32 v1, v1, v4
-; GCN-NEXT: v_mul_f32_e32 v2, v2, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_add_f32_e32 v0, v0, v6
-; GCN-NEXT: v_add_f32_e32 v1, v1, v7
-; GCN-NEXT: v_add_f32_e32 v2, v2, v8
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fmuladd_v3bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, v1, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6
-; GFX7-NEXT: v_mul_f32_e32 v2, v2, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v7
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v8
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fmuladd_v3bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mul_f16_e32 v1, v0, v2
-; GFX8-NEXT: v_mul_f16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v0, v1, v4
-; GFX8-NEXT: v_add_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fmuladd_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, 0xffff
-; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v0
-; GFX9-NEXT: v_bfi_b32 v1, s4, v2, v2
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1
-; GFX9-NEXT: v_bfi_b32 v1, s4, v4, v4
-; GFX9-NEXT: v_pk_add_f16 v0, v0, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fmuladd_v3bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v0, v0
-; GFX10-NEXT: v_bfi_b32 v1, 0xffff, v2, v2
-; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1
-; GFX10-NEXT: v_bfi_b32 v1, 0xffff, v4, v4
-; GFX10-NEXT: v_pk_add_f16 v0, v0, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
- ret <3 x bfloat> %op
-}
-
-define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
-; GCN-LABEL: v_fmuladd_v4bf16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT: v_mul_f32_e32 v0, v0, v4
-; GCN-NEXT: v_mul_f32_e32 v1, v1, v5
-; GCN-NEXT: v_mul_f32_e32 v2, v2, v6
-; GCN-NEXT: v_mul_f32_e32 v3, v3, v7
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: v_add_f32_e32 v0, v0, v8
-; GCN-NEXT: v_add_f32_e32 v1, v1, v9
-; GCN-NEXT: v_add_f32_e32 v2, v2, v10
-; GCN-NEXT: v_add_f32_e32 v3, v3, v11
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: v_fmuladd_v4bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v6
-; GFX7-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v7
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_mul_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mul_f32_e32 v3, v3, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v9
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v10
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v11
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fmuladd_v4bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mul_f16_e32 v1, v0, v2
-; GFX8-NEXT: v_mul_f16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v0, v1, v4
-; GFX8-NEXT: v_add_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fmuladd_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; GFX9-NEXT: v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
-; GFX9-NEXT: v_mov_b32_sdwa v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX9-NEXT: v_pk_add_f16 v0, v0, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fmuladd_v4bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v4
-; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2
-; GFX10-NEXT: v_mov_b32_sdwa v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX10-NEXT: v_pk_add_f16 v0, v0, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %op = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
- ret <4 x bfloat> %op
-}
More information about the llvm-commits
mailing list