[llvm] b01adc6 - AMDGPU: Strengthen some bfloat tests

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 20 04:34:18 PST 2023


Author: Matt Arsenault
Date: 2023-12-20T19:33:45+07:00
New Revision: b01adc6bed7e5b924dd8a097be0aa893f4823905

URL: https://github.com/llvm/llvm-project/commit/b01adc6bed7e5b924dd8a097be0aa893f4823905
DIFF: https://github.com/llvm/llvm-project/commit/b01adc6bed7e5b924dd8a097be0aa893f4823905.diff

LOG: AMDGPU: Strengthen some bfloat tests

Fix bitcast test, which was splitting apart phis intended to force
bitcasts that survive all the way to selection.

Disable the amdgpu-codegenprepare phi splitting, which defeats the technique
of using a phi to ensure a bitcast reaches all the way to selection. Also
add a variety of bfloat tests. These probably need revisiting to avoid the
cast folding into argument loads. Also round out set of bfloat bitcast and
ABI tests.

Add codegen tests for more bf16 operations The promotion of these works
contrary to the comment.

Added: 
    llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll

Modified: 
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
    llvm/test/CodeGen/AMDGPU/bf16.ll
    llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
    llvm/test/CodeGen/AMDGPU/function-args.ll
    llvm/test/CodeGen/AMDGPU/function-returns.ll
    llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
    llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
    llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
    llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
    llvm/test/CodeGen/AMDGPU/select-undef.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
index 1b22df80c2678b..5f048ce09157f3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck %s
 
 ; This test just checks that the compiler doesn't crash.
 
@@ -857,3 +857,1258 @@ end:
   store <30 x i32> %phi_cast, ptr addrspace(1) %out
   ret void
 }
+
+; CHECK-LABEL: {{^}}v_bitcast_v2bf16_to_i32:
+define void @v_bitcast_v2bf16_to_i32(i32 %cond, ptr addrspace(1) %out, <2 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <2 x bfloat> %value to i32
+  br label %end
+
+end:
+  %phi = phi i32 [0, %entry], [%cast, %if]
+  store i32 %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v2bf16_to_v2i16:
+define void @v_bitcast_v2bf16_to_v2i16(i32 %cond, ptr addrspace(1) %out, <2 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <2 x bfloat> %value to <2 x i16>
+  br label %end
+
+end:
+  %phi = phi <2 x i16> [zeroinitializer, %entry], [%cast, %if]
+  store <2 x i16> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v2bf16_to_v2f16:
+define void @v_bitcast_v2bf16_to_v2f16(i32 %cond, ptr addrspace(1) %out, <2 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <2 x bfloat> %value to <2 x half>
+  br label %end
+
+end:
+  %phi = phi <2 x half> [zeroinitializer, %entry], [%cast, %if]
+  store <2 x half> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v2bf16_to_v4i8:
+define void @v_bitcast_v2bf16_to_v4i8(i32 %cond, ptr addrspace(1) %out, <2 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <2 x bfloat> %value to <4 x i8>
+  br label %end
+
+end:
+  %phi = phi <4 x i8> [zeroinitializer, %entry], [%cast, %if]
+  store <4 x i8> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v3bf16_to_v3i16:
+define void @v_bitcast_v3bf16_to_v3i16(i32 %cond, ptr addrspace(1) %out, <3 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <3 x bfloat> %value to <3 x i16>
+  br label %end
+
+end:
+  %phi = phi <3 x i16> [zeroinitializer, %entry], [%cast, %if]
+  store <3 x i16> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v3bf16_to_v3f16:
+define void @v_bitcast_v3bf16_to_v3f16(i32 %cond, ptr addrspace(1) %out, <3 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <3 x bfloat> %value to <3 x half>
+  br label %end
+
+end:
+  %phi = phi <3 x half> [zeroinitializer, %entry], [%cast, %if]
+  store <3 x half> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_i32_to_v2bf16:
+define void @v_bitcast_i32_to_v2bf16(i32 %cond, ptr addrspace(1) %out, i32 %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast i32 %value to <2 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <2 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <2 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v2i16_to_v2bf16:
+define void @v_bitcast_v2i16_to_v2bf16(i32 %cond, ptr addrspace(1) %out, <2 x i16> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <2 x i16> %value to <2 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <2 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <2 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v2f16_to_v2bf16:
+define void @v_bitcast_v2f16_to_v2bf16(i32 %cond, ptr addrspace(1) %out, <2 x half> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <2 x half> %value to <2 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <2 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <2 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v4i8_to_v2bf16:
+define void @v_bitcast_v4i8_to_v2bf16(i32 %cond, ptr addrspace(1) %out, <4 x i8> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <4 x i8> %value to <2 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <2 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <2 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v3i16_to_v3bf16:
+define void @v_bitcast_v3i16_to_v3bf16(i32 %cond, ptr addrspace(1) %out, <3 x i16> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <3 x i16> %value to <3 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <3 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <3 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v4bf16_to_v4f16:
+define void @v_bitcast_v4bf16_to_v4f16(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <4 x bfloat> %value to <4 x half>
+  br label %end
+
+end:
+  %phi = phi <4 x half> [zeroinitializer, %entry], [%cast, %if]
+  store <4 x half> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v4bf16_to_v4i16:
+define void @v_bitcast_v4bf16_to_v4i16(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <4 x bfloat> %value to <4 x i16>
+  br label %end
+
+end:
+  %phi = phi <4 x i16> [zeroinitializer, %entry], [%cast, %if]
+  store <4 x i16> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v4bf16_to_v2i32:
+define void @v_bitcast_v4bf16_to_v2i32(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <4 x bfloat> %value to <2 x i32>
+  br label %end
+
+end:
+  %phi = phi <2 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <2 x i32> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v4bf16_to_v2f32:
+define void @v_bitcast_v4bf16_to_v2f32(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <4 x bfloat> %value to <2 x float>
+  br label %end
+
+end:
+  %phi = phi <2 x float> [zeroinitializer, %entry], [%cast, %if]
+  store <2 x float> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v4bf16_to_f64:
+define void @v_bitcast_v4bf16_to_f64(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <4 x bfloat> %value to double
+  br label %end
+
+end:
+  %phi = phi double [0.0, %entry], [%cast, %if]
+  store double %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v4bf16_to_i64:
+define void @v_bitcast_v4bf16_to_i64(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <4 x bfloat> %value to i64
+  br label %end
+
+end:
+  %phi = phi i64 [0, %entry], [%cast, %if]
+  store i64 %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v4bf16_to_v8i8:
+define void @v_bitcast_v4bf16_to_v8i8(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <4 x bfloat> %value to <8 x i8>
+  br label %end
+
+end:
+  %phi = phi <8 x i8> [zeroinitializer, %entry], [%cast, %if]
+  store <8 x i8> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_i64_to_v4bf16:
+define void @v_bitcast_i64_to_v4bf16(i32 %cond, ptr addrspace(1) %out, i64 %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast i64 %value to <4 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <4 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <4 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v2f32_to_v4bf16:
+define void @v_bitcast_v2f32_to_v4bf16(i32 %cond, ptr addrspace(1) %out, <2 x float> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <2 x float> %value to <4 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <4 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <4 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v2i32_to_v4bf16:
+define void @v_bitcast_v2i32_to_v4bf16(i32 %cond, ptr addrspace(1) %out, <2 x i32> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <2 x i32> %value to <4 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <4 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <4 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v4i16_to_v4bf16:
+define void @v_bitcast_v4i16_to_v4bf16(i32 %cond, ptr addrspace(1) %out, <4 x i16> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <4 x i16> %value to <4 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <4 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <4 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v4f16_to_v4bf16:
+define void @v_bitcast_v4f16_to_v4bf16(i32 %cond, ptr addrspace(1) %out, <4 x half> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <4 x half> %value to <4 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <4 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <4 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v6bf16_to_v6i16:
+define void @v_bitcast_v6bf16_to_v6i16(i32 %cond, ptr addrspace(1) %out, <6 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <6 x bfloat> %value to <6 x i16>
+  br label %end
+
+end:
+  %phi = phi <6 x i16> [zeroinitializer, %entry], [%cast, %if]
+  store <6 x i16> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v6bf16_to_v6f16:
+define void @v_bitcast_v6bf16_to_v6f16(i32 %cond, ptr addrspace(1) %out, <6 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <6 x bfloat> %value to <6 x half>
+  br label %end
+
+end:
+  %phi = phi <6 x half> [zeroinitializer, %entry], [%cast, %if]
+  store <6 x half> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v6bf16_to_v12i8:
+define void @v_bitcast_v6bf16_to_v12i8(i32 %cond, ptr addrspace(1) %out, <6 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <6 x bfloat> %value to <12 x i8>
+  br label %end
+
+end:
+  %phi = phi <12 x i8> [zeroinitializer, %entry], [%cast, %if]
+  store <12 x i8> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v6f16_to_v6bf16:
+define void @v_bitcast_v6f16_to_v6bf16(i32 %cond, ptr addrspace(1) %out, <6 x half> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <6 x half> %value to <6 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <6 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <6 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v6i16_to_v6bf16:
+define void @v_bitcast_v6i16_to_v6bf16(i32 %cond, ptr addrspace(1) %out, <6 x i16> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <6 x i16> %value to <6 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <6 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <6 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v12i8_to_v6bf16:
+define void @v_bitcast_v12i8_to_v6bf16(i32 %cond, ptr addrspace(1) %out, <12 x i8> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <12 x i8> %value to <6 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <6 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <6 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v8bf16_to_v2f64:
+define void @v_bitcast_v8bf16_to_v2f64(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <8 x bfloat> %value to <2 x double>
+  br label %end
+
+end:
+  %phi = phi <2 x double> [zeroinitializer, %entry], [%cast, %if]
+  store <2 x double> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v8bf16_to_v2i64:
+define void @v_bitcast_v8bf16_to_v2i64(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <8 x bfloat> %value to <2 x i64>
+  br label %end
+
+end:
+  %phi = phi <2 x i64> [zeroinitializer, %entry], [%cast, %if]
+  store <2 x i64> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v8bf16_to_v4f32:
+define void @v_bitcast_v8bf16_to_v4f32(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <8 x bfloat> %value to <4 x float>
+  br label %end
+
+end:
+  %phi = phi <4 x float> [zeroinitializer, %entry], [%cast, %if]
+  store <4 x float> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v8bf16_to_v4i32:
+define void @v_bitcast_v8bf16_to_v4i32(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <8 x bfloat> %value to <4 x i32>
+  br label %end
+
+end:
+  %phi = phi <4 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <4 x i32> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v8bf16_to_v8f16:
+define void @v_bitcast_v8bf16_to_v8f16(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <8 x bfloat> %value to <8 x half>
+  br label %end
+
+end:
+  %phi = phi <8 x half> [zeroinitializer, %entry], [%cast, %if]
+  store <8 x half> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v8bf16_to_v8i16:
+define void @v_bitcast_v8bf16_to_v8i16(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <8 x bfloat> %value to <8 x i16>
+  br label %end
+
+end:
+  %phi = phi <8 x i16> [zeroinitializer, %entry], [%cast, %if]
+  store <8 x i16> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v8f16_to_v8bf16:
+define void @v_bitcast_v8f16_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <8 x half> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <8 x half> %value to <8 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <8 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <8 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v8i16_to_v8bf16:
+define void @v_bitcast_v8i16_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <8 x i16> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <8 x i16> %value to <8 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <8 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <8 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v16i8_to_v8bf16:
+define void @v_bitcast_v16i8_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <16 x i8> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <16 x i8> %value to <8 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <8 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <8 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v2i64_to_v8bf16:
+define void @v_bitcast_v2i64_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <2 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <2 x i64> %value to <8 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <8 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <8 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v2f64_to_v8bf16:
+define void @v_bitcast_v2f64_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <2 x double> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <2 x double> %value to <8 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <8 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <8 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v4i32_to_v8bf16:
+define void @v_bitcast_v4i32_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <4 x i32> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <4 x i32> %value to <8 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <8 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <8 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v4f32_to_v8bf16:
+define void @v_bitcast_v4f32_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <4 x float> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <4 x float> %value to <8 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <8 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <8 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v16bf16_to_v16i16:
+define void @v_bitcast_v16bf16_to_v16i16(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <16 x bfloat> %value to <16 x i16>
+  br label %end
+
+end:
+  %phi = phi <16 x i16> [zeroinitializer, %entry], [%cast, %if]
+  store <16 x i16> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v16bf16_to_v16f16:
+define void @v_bitcast_v16bf16_to_v16f16(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <16 x bfloat> %value to <16 x half>
+  br label %end
+
+end:
+  %phi = phi <16 x half> [zeroinitializer, %entry], [%cast, %if]
+  store <16 x half> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v16bf16_to_v8i32:
+define void @v_bitcast_v16bf16_to_v8i32(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <16 x bfloat> %value to <8 x i32>
+  br label %end
+
+end:
+  %phi = phi <8 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <8 x i32> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v16bf16_to_v8f32:
+define void @v_bitcast_v16bf16_to_v8f32(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <16 x bfloat> %value to <8 x float>
+  br label %end
+
+end:
+  %phi = phi <8 x float> [zeroinitializer, %entry], [%cast, %if]
+  store <8 x float> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v16bf16_to_v4f64:
+define void @v_bitcast_v16bf16_to_v4f64(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <16 x bfloat> %value to <4 x double>
+  br label %end
+
+end:
+  %phi = phi <4 x double> [zeroinitializer, %entry], [%cast, %if]
+  store <4 x double> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v16bf16_to_v4i64:
+define void @v_bitcast_v16bf16_to_v4i64(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <16 x bfloat> %value to <4 x i64>
+  br label %end
+
+end:
+  %phi = phi <4 x i64> [zeroinitializer, %entry], [%cast, %if]
+  store <4 x i64> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v16bf16_to_v32i8:
+define void @v_bitcast_v16bf16_to_v32i8(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <16 x bfloat> %value to <32 x i8>
+  br label %end
+
+end:
+  %phi = phi <32 x i8> [zeroinitializer, %entry], [%cast, %if]
+  store <32 x i8> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v8f32_to_v16bf16:
+define void @v_bitcast_v8f32_to_v16bf16(i32 %cond, ptr addrspace(1) %out, <8 x float> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <8 x float> %value to <16 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <16 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <16 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v8i32_to_v16bf16:
+define void @v_bitcast_v8i32_to_v16bf16(i32 %cond, ptr addrspace(1) %out, <8 x i32> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <8 x i32> %value to <16 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <16 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <16 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v4i64_to_v16bf16:
+define void @v_bitcast_v4i64_to_v16bf16(i32 %cond, ptr addrspace(1) %out, <4 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <4 x i64> %value to <16 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <16 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <16 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v4f64_to_v16bf16:
+define void @v_bitcast_v4f64_to_v16bf16(i32 %cond, ptr addrspace(1) %out, <4 x double> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <4 x double> %value to <16 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <16 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <16 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v32i8_to_v16bf16:
+define void @v_bitcast_v32i8_to_v16bf16(i32 %cond, ptr addrspace(1) %out, <32 x i8> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <32 x i8> %value to <16 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <16 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <16 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v32bf16_to_v8i64:
+define void @v_bitcast_v32bf16_to_v8i64(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <32 x bfloat> %value to <8 x i64>
+  br label %end
+
+end:
+  %phi = phi <8 x i64> [zeroinitializer, %entry], [%cast, %if]
+  store <8 x i64> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v32bf16_to_v8f64:
+define void @v_bitcast_v32bf16_to_v8f64(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <32 x bfloat> %value to <8 x double>
+  br label %end
+
+end:
+  %phi = phi <8 x double> [zeroinitializer, %entry], [%cast, %if]
+  store <8 x double> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v32bf16_to_v16i32:
+define void @v_bitcast_v32bf16_to_v16i32(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <32 x bfloat> %value to <16 x i32>
+  br label %end
+
+end:
+  %phi = phi <16 x i32> [zeroinitializer, %entry], [%cast, %if]
+  store <16 x i32> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v32bf16_to_v16f32:
+define void @v_bitcast_v32bf16_to_v16f32(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <32 x bfloat> %value to <16 x float>
+  br label %end
+
+end:
+  %phi = phi <16 x float> [zeroinitializer, %entry], [%cast, %if]
+  store <16 x float> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v32bf16_to_v32f16:
+define void @v_bitcast_v32bf16_to_v32f16(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <32 x bfloat> %value to <32 x half>
+  br label %end
+
+end:
+  %phi = phi <32 x half> [zeroinitializer, %entry], [%cast, %if]
+  store <32 x half> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v32bf16_to_v32i16:
+define void @v_bitcast_v32bf16_to_v32i16(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <32 x bfloat> %value to <32 x i16>
+  br label %end
+
+end:
+  %phi = phi <32 x i16> [zeroinitializer, %entry], [%cast, %if]
+  store <32 x i16> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v32bf16_to_v64i8:
+define void @v_bitcast_v32bf16_to_v64i8(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <32 x bfloat> %value to <64 x i8>
+  br label %end
+
+end:
+  %phi = phi <64 x i8> [zeroinitializer, %entry], [%cast, %if]
+  store <64 x i8> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v64i8_to_v32bf16:
+define void @v_bitcast_v64i8_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <64 x i8> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <64 x i8> %value to <32 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <32 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <32 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v32i16_to_v32bf16:
+define void @v_bitcast_v32i16_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <32 x i16> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <32 x i16> %value to <32 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <32 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <32 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v32f16_to_v32bf16:
+define void @v_bitcast_v32f16_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <32 x half> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <32 x half> %value to <32 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <32 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <32 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v16i32_to_v32bf16:
+define void @v_bitcast_v16i32_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <16 x i32> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <16 x i32> %value to <32 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <32 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <32 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v16f32_to_v32bf16:
+define void @v_bitcast_v16f32_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <16 x float> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <16 x float> %value to <32 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <32 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <32 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v8f64_to_v32bf16:
+define void @v_bitcast_v8f64_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <8 x double> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <8 x double> %value to <32 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <32 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <32 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v8i64_to_v32bf16:
+define void @v_bitcast_v8i64_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <8 x i64> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <8 x i64> %value to <32 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <32 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <32 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+
+
+
+
+
+
+
+; CHECK-LABEL: {{^}}v_bitcast_v32f32_to_v64bf16:
+define void @v_bitcast_v32f32_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <32 x float> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <32 x float> %value to <64 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <64 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <64 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v32i32_to_v64bf16:
+define void @v_bitcast_v32i32_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <32 x i32> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <32 x i32> %value to <64 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <64 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <64 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v64i16_to_v64bf16:
+define void @v_bitcast_v64i16_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <64 x i16> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <64 x i16> %value to <64 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <64 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <64 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v64f16_to_v64bf16:
+define void @v_bitcast_v64f16_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <64 x half> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <64 x half> %value to <64 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <64 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <64 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v128i8_to_v64bf16:
+define void @v_bitcast_v128i8_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <128 x i8> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <128 x i8> %value to <64 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <64 x bfloat> [zeroinitializer, %entry], [%cast, %if]
+  store <64 x bfloat> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v64bf16_to_v64i16:
+define void @v_bitcast_v64bf16_to_v64i16(i32 %cond, ptr addrspace(1) %out, <64 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <64 x bfloat> %value to <64 x i16>
+  br label %end
+
+end:
+  %phi = phi <64 x i16> [zeroinitializer, %entry], [%cast, %if]
+  store <64 x i16> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v64bf16_to_v64f16:
+define void @v_bitcast_v64bf16_to_v64f16(i32 %cond, ptr addrspace(1) %out, <64 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <64 x bfloat> %value to <64 x half>
+  br label %end
+
+end:
+  %phi = phi <64 x half> [zeroinitializer, %entry], [%cast, %if]
+  store <64 x half> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v64bf16_to_v128i8:
+define void @v_bitcast_v64bf16_to_v128i8(i32 %cond, ptr addrspace(1) %out, <64 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <64 x bfloat> %value to <128 x i8>
+  br label %end
+
+end:
+  %phi = phi <128 x i8> [zeroinitializer, %entry], [%cast, %if]
+  store <128 x i8> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v64bf16_to_v16i64:
+define void @v_bitcast_v64bf16_to_v16i64(i32 %cond, ptr addrspace(1) %out, <64 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <64 x bfloat> %value to <16 x i64>
+  br label %end
+
+end:
+  %phi = phi <16 x i64> [zeroinitializer, %entry], [%cast, %if]
+  store <16 x i64> %phi, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_bitcast_v64bf16_to_v16f64:
+define void @v_bitcast_v64bf16_to_v16f64(i32 %cond, ptr addrspace(1) %out, <64 x bfloat> %value) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %if, label %end
+
+if:
+  %cast = bitcast <64 x bfloat> %value to <16 x double>
+  br label %end
+
+end:
+  %phi = phi <16 x double> [zeroinitializer, %entry], [%cast, %if]
+  store <16 x double> %phi, ptr addrspace(1) %out
+  ret void
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 758b11d957c463..79b9f8caea945b 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -1,15 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck %s -check-prefixes=GCN
-; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s -check-prefixes=GFX7
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefixes=GFX8
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX9
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs | FileCheck %s -check-prefix=GFX11
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs | FileCheck %s -check-prefix=GFX11
-
-; We only have storage-only BF16 support. We can load/store those values as we treat them as u16, but
-; we don't support operations on them. As such, codegen is expected to fail for any operation other
-; than simple load/stores.
+; RUN: llc < %s -mtriple=amdgcn | FileCheck %s -check-prefixes=GCN
+; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefixes=GFX7
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefix=GFX11
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefix=GFX11
 
 define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_load_store:
@@ -6873,3 +6869,12797 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
   %fpext = fpext <32 x bfloat> %load to <32 x double>
   ret <32 x double> %fpext
 }
+
+define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fadd_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fadd_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fadd bfloat %a, %b
+  ret bfloat %op
+}
+
+define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_fadd_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v3
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v2
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fadd_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fadd <2 x bfloat> %a, %b
+  ret <2 x bfloat> %op
+}
+
+define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+; GCN-LABEL: v_fadd_v3bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_add_f32_e32 v2, v2, v5
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v4
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v3
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_v3bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v4
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_v3bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_v3bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_v3bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fadd_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_add_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fadd <3 x bfloat> %a, %b
+  ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; GCN-LABEL: v_fadd_v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_add_f32_e32 v3, v3, v7
+; GCN-NEXT:    v_add_f32_e32 v2, v2, v6
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v5
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v4
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_add_f32_e32 v3, v3, v7
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_add_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    v_perm_b32 v1, v1, v4, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_add_f32_e32 v3, v5, v3
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_add_f32_e32 v3, v5, v4
+; GFX10-NEXT:    v_add_f32_e32 v4, v7, v6
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
+; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fadd_v4bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_add_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_add_f32 v3, v7, v6 :: v_dual_add_f32 v4, v5, v4
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x3020706
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fadd <4 x bfloat> %a, %b
+  ret <4 x bfloat> %op
+}
+
+define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; GCN-LABEL: v_fadd_v8bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_add_f32_e32 v7, v7, v15
+; GCN-NEXT:    v_add_f32_e32 v6, v6, v14
+; GCN-NEXT:    v_add_f32_e32 v5, v5, v13
+; GCN-NEXT:    v_add_f32_e32 v4, v4, v12
+; GCN-NEXT:    v_add_f32_e32 v3, v3, v11
+; GCN-NEXT:    v_add_f32_e32 v2, v2, v10
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v9
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v8
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_v8bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_add_f32_e32 v7, v7, v15
+; GFX7-NEXT:    v_add_f32_e32 v6, v6, v14
+; GFX7-NEXT:    v_add_f32_e32 v5, v5, v13
+; GFX7-NEXT:    v_add_f32_e32 v4, v4, v12
+; GFX7-NEXT:    v_add_f32_e32 v3, v3, v11
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v10
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v9
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v8
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_v8bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v7
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v6
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_add_f32_e32 v7, v9, v7
+; GFX8-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_add_f32_e32 v6, v9, v6
+; GFX8-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_add_f32_e32 v5, v9, v5
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v5, s4
+; GFX8-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX8-NEXT:    v_perm_b32 v2, v2, v7, s4
+; GFX8-NEXT:    v_perm_b32 v3, v3, v8, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_v8bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
+; GFX9-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
+; GFX9-NEXT:    v_add_f32_e32 v9, v10, v9
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v6
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
+; GFX9-NEXT:    v_add_f32_e32 v10, v11, v10
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v4
+; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v9
+; GFX9-NEXT:    v_add_f32_e32 v11, v12, v11
+; GFX9-NEXT:    v_add_f32_e32 v3, v3, v7
+; GFX9-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v8
+; GFX9-NEXT:    v_perm_b32 v0, v0, v11, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v10, s4
+; GFX9-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_v8bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v6
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v4
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v0
+; GFX10-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX10-NEXT:    v_add_f32_e32 v9, v11, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_add_f32_e32 v10, v13, v12
+; GFX10-NEXT:    v_add_f32_e32 v11, v15, v14
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v7
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v9
+; GFX10-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v8
+; GFX10-NEXT:    v_perm_b32 v0, v0, v11, 0x3020706
+; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_perm_b32 v2, v2, v10, 0x3020706
+; GFX10-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fadd_v8bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_add_f32 v8, v9, v8 :: v_dual_and_b32 v9, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    v_add_f32_e32 v3, v3, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add_f32_e32 v9, v10, v9
+; GFX11-NEXT:    v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v12, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v9
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX11-NEXT:    v_dual_add_f32 v10, v12, v11 :: v_dual_add_f32 v11, v14, v13
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX11-NEXT:    v_perm_b32 v2, v2, v10, 0x3020706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v11, 0x3020706
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fadd <8 x bfloat> %a, %b
+  ret <8 x bfloat> %op
+}
+
+define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
+; GCN-LABEL: v_fadd_v16bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT:    v_add_f32_e32 v14, v14, v30
+; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT:    v_add_f32_e32 v13, v13, v29
+; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT:    v_add_f32_e32 v12, v12, v28
+; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT:    v_add_f32_e32 v11, v11, v27
+; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT:    v_add_f32_e32 v10, v10, v26
+; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT:    v_add_f32_e32 v9, v9, v25
+; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT:    v_add_f32_e32 v8, v8, v24
+; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_add_f32_e32 v7, v7, v23
+; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_add_f32_e32 v6, v6, v22
+; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_add_f32_e32 v5, v5, v21
+; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_add_f32_e32 v4, v4, v20
+; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_add_f32_e32 v3, v3, v19
+; GCN-NEXT:    v_add_f32_e32 v2, v2, v18
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v17
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v16
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v20
+; GCN-NEXT:    v_add_f32_e32 v15, v15, v16
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_v16bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_add_f32_e32 v4, v4, v20
+; GFX7-NEXT:    buffer_load_dword v20, off, s[0:3], s32
+; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v16
+; GFX7-NEXT:    v_add_f32_e32 v14, v14, v30
+; GFX7-NEXT:    v_add_f32_e32 v13, v13, v29
+; GFX7-NEXT:    v_add_f32_e32 v12, v12, v28
+; GFX7-NEXT:    v_add_f32_e32 v11, v11, v27
+; GFX7-NEXT:    v_add_f32_e32 v10, v10, v26
+; GFX7-NEXT:    v_add_f32_e32 v9, v9, v25
+; GFX7-NEXT:    v_add_f32_e32 v8, v8, v24
+; GFX7-NEXT:    v_add_f32_e32 v7, v7, v23
+; GFX7-NEXT:    v_add_f32_e32 v6, v6, v22
+; GFX7-NEXT:    v_add_f32_e32 v5, v5, v21
+; GFX7-NEXT:    v_add_f32_e32 v3, v3, v19
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v18
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v17
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v20
+; GFX7-NEXT:    v_add_f32_e32 v15, v15, v16
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_v16bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX8-NEXT:    v_add_f32_e32 v7, v7, v15
+; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v14
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_add_f32_e32 v15, v17, v15
+; GFX8-NEXT:    v_add_f32_e32 v6, v6, v14
+; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v13
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_add_f32_e32 v14, v17, v14
+; GFX8-NEXT:    v_add_f32_e32 v5, v5, v13
+; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v12
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_add_f32_e32 v13, v17, v13
+; GFX8-NEXT:    v_add_f32_e32 v4, v4, v12
+; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v11
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_add_f32_e32 v12, v17, v12
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v11
+; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v10
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_add_f32_e32 v11, v17, v11
+; GFX8-NEXT:    v_add_f32_e32 v2, v2, v10
+; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v9
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_add_f32_e32 v10, v17, v10
+; GFX8-NEXT:    v_add_f32_e32 v1, v1, v9
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v8
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_add_f32_e32 v9, v17, v9
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v8
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v9, s4
+; GFX8-NEXT:    v_perm_b32 v1, v1, v10, s4
+; GFX8-NEXT:    v_perm_b32 v2, v2, v11, s4
+; GFX8-NEXT:    v_perm_b32 v3, v3, v12, s4
+; GFX8-NEXT:    v_perm_b32 v4, v4, v13, s4
+; GFX8-NEXT:    v_perm_b32 v5, v5, v14, s4
+; GFX8-NEXT:    v_perm_b32 v6, v6, v15, s4
+; GFX8-NEXT:    v_perm_b32 v7, v7, v16, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_v16bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
+; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v14
+; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
+; GFX9-NEXT:    v_add_f32_e32 v17, v18, v17
+; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v13
+; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v5
+; GFX9-NEXT:    v_add_f32_e32 v18, v19, v18
+; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v12
+; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v4
+; GFX9-NEXT:    v_add_f32_e32 v19, v20, v19
+; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v11
+; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v3
+; GFX9-NEXT:    v_add_f32_e32 v20, v21, v20
+; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v10
+; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v2
+; GFX9-NEXT:    v_add_f32_e32 v21, v22, v21
+; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v9
+; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v1
+; GFX9-NEXT:    v_add_f32_e32 v22, v23, v22
+; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v8
+; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_add_f32_e32 v23, v24, v23
+; GFX9-NEXT:    v_add_f32_e32 v7, v7, v15
+; GFX9-NEXT:    v_add_f32_e32 v6, v6, v14
+; GFX9-NEXT:    v_add_f32_e32 v5, v5, v13
+; GFX9-NEXT:    v_add_f32_e32 v4, v4, v12
+; GFX9-NEXT:    v_add_f32_e32 v3, v3, v11
+; GFX9-NEXT:    v_add_f32_e32 v2, v2, v10
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v9
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v8
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_perm_b32 v0, v0, v23, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v22, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v21, s4
+; GFX9-NEXT:    v_perm_b32 v3, v3, v20, s4
+; GFX9-NEXT:    v_perm_b32 v4, v4, v19, s4
+; GFX9-NEXT:    v_perm_b32 v5, v5, v18, s4
+; GFX9-NEXT:    v_perm_b32 v6, v6, v17, s4
+; GFX9-NEXT:    v_perm_b32 v7, v7, v16, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_v16bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
+; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
+; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
+; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v13
+; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
+; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
+; GFX10-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v14
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
+; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v11
+; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v10
+; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v2
+; GFX10-NEXT:    v_add_f32_e32 v17, v18, v17
+; GFX10-NEXT:    v_add_f32_e32 v18, v20, v19
+; GFX10-NEXT:    v_add_f32_e32 v19, v22, v21
+; GFX10-NEXT:    v_add_f32_e32 v20, v24, v23
+; GFX10-NEXT:    v_add_f32_e32 v21, v26, v25
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v9
+; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v8
+; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_add_f32_e32 v22, v23, v22
+; GFX10-NEXT:    v_add_f32_e32 v23, v25, v24
+; GFX10-NEXT:    v_add_f32_e32 v7, v7, v15
+; GFX10-NEXT:    v_add_f32_e32 v6, v6, v14
+; GFX10-NEXT:    v_add_f32_e32 v5, v5, v13
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v8
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v9
+; GFX10-NEXT:    v_add_f32_e32 v2, v2, v10
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v11
+; GFX10-NEXT:    v_add_f32_e32 v4, v4, v12
+; GFX10-NEXT:    v_perm_b32 v0, v0, v23, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v1, v1, v22, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v2, v2, v21, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v3, v3, v20, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v4, v4, v19, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v5, v5, v18, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v6, v6, v17, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v7, v7, v16, 0x3020706
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fadd_v16bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
+; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
+; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v11
+; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v13
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
+; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_dual_add_f32 v16, v17, v16 :: v_dual_and_b32 v17, 0xffff0000, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_add_f32 v4, v4, v12 :: v_dual_add_f32 v5, v5, v13
+; GFX11-NEXT:    v_dual_add_f32 v17, v18, v17 :: v_dual_add_f32 v18, v20, v19
+; GFX11-NEXT:    v_add_f32_e32 v19, v22, v21
+; GFX11-NEXT:    v_add_f32_e32 v7, v7, v15
+; GFX11-NEXT:    v_add_f32_e32 v21, v26, v25
+; GFX11-NEXT:    v_dual_add_f32 v6, v6, v14 :: v_dual_and_b32 v25, 0xffff0000, v0
+; GFX11-NEXT:    v_add_f32_e32 v20, v24, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT:    v_dual_add_f32 v2, v2, v10 :: v_dual_add_f32 v3, v3, v11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_add_f32 v1, v1, v9 :: v_dual_add_f32 v22, v23, v22
+; GFX11-NEXT:    v_add_f32_e32 v23, v25, v24
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v8
+; GFX11-NEXT:    v_perm_b32 v2, v2, v21, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v3, v3, v20, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v1, v1, v22, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v4, v4, v19, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v0, v0, v23, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v5, v5, v18, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v6, v6, v17, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fadd <16 x bfloat> %a, %b
+  ret <16 x bfloat> %op
+}
+
+define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
+; GCN-LABEL: v_fadd_v32bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:124
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    v_add_f32_e32 v31, v32, v31
+; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:120
+; GCN-NEXT:    v_add_f32_e32 v30, v30, v32
+; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v29, v29, v33
+; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:112
+; GCN-NEXT:    v_add_f32_e32 v28, v28, v32
+; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v27, v27, v33
+; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:104
+; GCN-NEXT:    v_add_f32_e32 v26, v26, v32
+; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v25, v25, v33
+; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:96
+; GCN-NEXT:    v_add_f32_e32 v24, v24, v32
+; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v23, v23, v33
+; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:88
+; GCN-NEXT:    v_add_f32_e32 v22, v22, v32
+; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v21, v21, v33
+; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:80
+; GCN-NEXT:    v_add_f32_e32 v20, v20, v32
+; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v19, v19, v33
+; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:72
+; GCN-NEXT:    v_add_f32_e32 v18, v18, v32
+; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v17, v17, v33
+; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:64
+; GCN-NEXT:    v_add_f32_e32 v16, v16, v32
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v15, v15, v33
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:56
+; GCN-NEXT:    v_add_f32_e32 v14, v14, v32
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v13, v13, v33
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:48
+; GCN-NEXT:    v_add_f32_e32 v12, v12, v32
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v11, v11, v33
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:40
+; GCN-NEXT:    v_add_f32_e32 v10, v10, v32
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v9, v9, v33
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:32
+; GCN-NEXT:    v_add_f32_e32 v8, v8, v32
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v7, v7, v33
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:24
+; GCN-NEXT:    v_add_f32_e32 v6, v6, v32
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v5, v5, v33
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:16
+; GCN-NEXT:    v_add_f32_e32 v4, v4, v32
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v3, v3, v33
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GCN-NEXT:    v_add_f32_e32 v2, v2, v32
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v33
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v32
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_v32bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v31, v32, v31
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v30, v30, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:120
+; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v29, v29, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v28, v28, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:112
+; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v27, v27, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v26, v26, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:104
+; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v25, v25, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v24, v24, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:96
+; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v23, v23, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v22, v22, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
+; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v21, v21, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v20, v20, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:80
+; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v19, v19, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v18, v18, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
+; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v17, v17, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v16, v16, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v15, v15, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v14, v14, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
+; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v13, v13, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v12, v12, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:48
+; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v11, v11, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v10, v10, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:40
+; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v9, v9, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v8, v8, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:32
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v7, v7, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v6, v6, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v5, v5, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v4, v4, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:16
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v3, v3, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v32
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_v32bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v31, 0xffff0000, v30
+; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_add_f32_e32 v31, v32, v31
+; GFX8-NEXT:    v_add_f32_e32 v14, v14, v30
+; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v29
+; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_add_f32_e32 v30, v32, v30
+; GFX8-NEXT:    v_add_f32_e32 v13, v13, v29
+; GFX8-NEXT:    v_and_b32_e32 v29, 0xffff0000, v28
+; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_add_f32_e32 v29, v32, v29
+; GFX8-NEXT:    v_add_f32_e32 v12, v12, v28
+; GFX8-NEXT:    v_and_b32_e32 v28, 0xffff0000, v27
+; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_add_f32_e32 v28, v32, v28
+; GFX8-NEXT:    v_add_f32_e32 v11, v11, v27
+; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v26
+; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_add_f32_e32 v27, v32, v27
+; GFX8-NEXT:    v_add_f32_e32 v10, v10, v26
+; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v25
+; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_add_f32_e32 v26, v32, v26
+; GFX8-NEXT:    v_add_f32_e32 v9, v9, v25
+; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v24
+; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_add_f32_e32 v8, v8, v24
+; GFX8-NEXT:    buffer_load_dword v24, off, s[0:3], s32
+; GFX8-NEXT:    v_add_f32_e32 v25, v32, v25
+; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v8, v8, v25, s4
+; GFX8-NEXT:    v_perm_b32 v9, v9, v26, s4
+; GFX8-NEXT:    v_perm_b32 v10, v10, v27, s4
+; GFX8-NEXT:    v_perm_b32 v11, v11, v28, s4
+; GFX8-NEXT:    v_perm_b32 v12, v12, v29, s4
+; GFX8-NEXT:    v_perm_b32 v13, v13, v30, s4
+; GFX8-NEXT:    v_perm_b32 v14, v14, v31, s4
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX8-NEXT:    v_add_f32_e32 v32, v32, v33
+; GFX8-NEXT:    v_add_f32_e32 v15, v15, v24
+; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v23
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_add_f32_e32 v24, v33, v24
+; GFX8-NEXT:    v_add_f32_e32 v7, v7, v23
+; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v22
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_add_f32_e32 v23, v33, v23
+; GFX8-NEXT:    v_add_f32_e32 v6, v6, v22
+; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v21
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_add_f32_e32 v22, v33, v22
+; GFX8-NEXT:    v_add_f32_e32 v5, v5, v21
+; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v20
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_add_f32_e32 v21, v33, v21
+; GFX8-NEXT:    v_add_f32_e32 v4, v4, v20
+; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v19
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_add_f32_e32 v20, v33, v20
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v19
+; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v18
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_add_f32_e32 v19, v33, v19
+; GFX8-NEXT:    v_add_f32_e32 v2, v2, v18
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v17
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_add_f32_e32 v18, v33, v18
+; GFX8-NEXT:    v_add_f32_e32 v1, v1, v17
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v16
+; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_add_f32_e32 v17, v33, v17
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v16
+; GFX8-NEXT:    v_perm_b32 v0, v0, v17, s4
+; GFX8-NEXT:    v_perm_b32 v1, v1, v18, s4
+; GFX8-NEXT:    v_perm_b32 v2, v2, v19, s4
+; GFX8-NEXT:    v_perm_b32 v3, v3, v20, s4
+; GFX8-NEXT:    v_perm_b32 v4, v4, v21, s4
+; GFX8-NEXT:    v_perm_b32 v5, v5, v22, s4
+; GFX8-NEXT:    v_perm_b32 v6, v6, v23, s4
+; GFX8-NEXT:    v_perm_b32 v7, v7, v24, s4
+; GFX8-NEXT:    v_perm_b32 v15, v15, v32, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_v32bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
+; GFX9-NEXT:    v_and_b32_e32 v39, 0xffff0000, v11
+; GFX9-NEXT:    v_and_b32_e32 v48, 0xffff0000, v26
+; GFX9-NEXT:    v_and_b32_e32 v49, 0xffff0000, v10
+; GFX9-NEXT:    v_and_b32_e32 v50, 0xffff0000, v25
+; GFX9-NEXT:    v_and_b32_e32 v51, 0xffff0000, v9
+; GFX9-NEXT:    v_and_b32_e32 v40, 0xffff0000, v22
+; GFX9-NEXT:    v_and_b32_e32 v41, 0xffff0000, v6
+; GFX9-NEXT:    v_and_b32_e32 v58, 0xffff0000, v17
+; GFX9-NEXT:    v_and_b32_e32 v59, 0xffff0000, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v52, 0xffff0000, v24
+; GFX9-NEXT:    v_and_b32_e32 v53, 0xffff0000, v8
+; GFX9-NEXT:    v_and_b32_e32 v54, 0xffff0000, v23
+; GFX9-NEXT:    v_and_b32_e32 v55, 0xffff0000, v7
+; GFX9-NEXT:    v_and_b32_e32 v42, 0xffff0000, v21
+; GFX9-NEXT:    v_and_b32_e32 v43, 0xffff0000, v5
+; GFX9-NEXT:    v_and_b32_e32 v44, 0xffff0000, v20
+; GFX9-NEXT:    v_and_b32_e32 v45, 0xffff0000, v4
+; GFX9-NEXT:    v_and_b32_e32 v46, 0xffff0000, v19
+; GFX9-NEXT:    v_and_b32_e32 v47, 0xffff0000, v3
+; GFX9-NEXT:    v_and_b32_e32 v56, 0xffff0000, v18
+; GFX9-NEXT:    v_and_b32_e32 v57, 0xffff0000, v2
+; GFX9-NEXT:    v_add_f32_e32 v38, v39, v38
+; GFX9-NEXT:    v_add_f32_e32 v39, v49, v48
+; GFX9-NEXT:    v_add_f32_e32 v48, v51, v50
+; GFX9-NEXT:    v_add_f32_e32 v51, v41, v40
+; GFX9-NEXT:    v_add_f32_e32 v40, v59, v58
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v17
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_add_f32_e32 v49, v53, v52
+; GFX9-NEXT:    v_add_f32_e32 v50, v55, v54
+; GFX9-NEXT:    v_add_f32_e32 v52, v43, v42
+; GFX9-NEXT:    v_add_f32_e32 v53, v45, v44
+; GFX9-NEXT:    v_add_f32_e32 v54, v47, v46
+; GFX9-NEXT:    v_add_f32_e32 v55, v57, v56
+; GFX9-NEXT:    v_perm_b32 v1, v1, v40, s4
+; GFX9-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    v_and_b32_e32 v32, 0xffff0000, v30
+; GFX9-NEXT:    v_and_b32_e32 v33, 0xffff0000, v14
+; GFX9-NEXT:    v_and_b32_e32 v34, 0xffff0000, v29
+; GFX9-NEXT:    v_and_b32_e32 v35, 0xffff0000, v13
+; GFX9-NEXT:    v_and_b32_e32 v36, 0xffff0000, v28
+; GFX9-NEXT:    v_and_b32_e32 v37, 0xffff0000, v12
+; GFX9-NEXT:    v_add_f32_e32 v32, v33, v32
+; GFX9-NEXT:    v_and_b32_e32 v33, 0xffff0000, v16
+; GFX9-NEXT:    v_add_f32_e32 v34, v35, v34
+; GFX9-NEXT:    v_and_b32_e32 v35, 0xffff0000, v0
+; GFX9-NEXT:    v_add_f32_e32 v36, v37, v36
+; GFX9-NEXT:    v_and_b32_e32 v37, 0xffff0000, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT:    v_add_f32_e32 v33, v35, v33
+; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_add_f32_e32 v14, v14, v30
+; GFX9-NEXT:    v_add_f32_e32 v13, v13, v29
+; GFX9-NEXT:    v_add_f32_e32 v12, v12, v28
+; GFX9-NEXT:    v_add_f32_e32 v11, v11, v27
+; GFX9-NEXT:    v_add_f32_e32 v10, v10, v26
+; GFX9-NEXT:    v_add_f32_e32 v9, v9, v25
+; GFX9-NEXT:    v_add_f32_e32 v8, v8, v24
+; GFX9-NEXT:    v_add_f32_e32 v7, v7, v23
+; GFX9-NEXT:    v_add_f32_e32 v6, v6, v22
+; GFX9-NEXT:    v_add_f32_e32 v5, v5, v21
+; GFX9-NEXT:    v_add_f32_e32 v4, v4, v20
+; GFX9-NEXT:    v_add_f32_e32 v3, v3, v19
+; GFX9-NEXT:    v_add_f32_e32 v2, v2, v18
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v16
+; GFX9-NEXT:    v_perm_b32 v0, v0, v33, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v55, s4
+; GFX9-NEXT:    v_perm_b32 v3, v3, v54, s4
+; GFX9-NEXT:    v_perm_b32 v4, v4, v53, s4
+; GFX9-NEXT:    v_perm_b32 v5, v5, v52, s4
+; GFX9-NEXT:    v_perm_b32 v6, v6, v51, s4
+; GFX9-NEXT:    v_perm_b32 v7, v7, v50, s4
+; GFX9-NEXT:    v_perm_b32 v8, v8, v49, s4
+; GFX9-NEXT:    v_perm_b32 v9, v9, v48, s4
+; GFX9-NEXT:    v_perm_b32 v10, v10, v39, s4
+; GFX9-NEXT:    v_perm_b32 v11, v11, v38, s4
+; GFX9-NEXT:    v_perm_b32 v12, v12, v36, s4
+; GFX9-NEXT:    v_perm_b32 v13, v13, v34, s4
+; GFX9-NEXT:    v_perm_b32 v14, v14, v32, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v35, 0xffff0000, v31
+; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
+; GFX9-NEXT:    v_add_f32_e32 v35, v37, v35
+; GFX9-NEXT:    v_add_f32_e32 v15, v15, v31
+; GFX9-NEXT:    v_perm_b32 v15, v15, v35, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_v32bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT:    v_and_b32_e32 v53, 0xffff0000, v24
+; GFX10-NEXT:    v_and_b32_e32 v54, 0xffff0000, v8
+; GFX10-NEXT:    v_and_b32_e32 v55, 0xffff0000, v23
+; GFX10-NEXT:    v_and_b32_e32 v64, 0xffff0000, v7
+; GFX10-NEXT:    v_and_b32_e32 v65, 0xffff0000, v22
+; GFX10-NEXT:    v_and_b32_e32 v66, 0xffff0000, v6
+; GFX10-NEXT:    v_and_b32_e32 v67, 0xffff0000, v21
+; GFX10-NEXT:    v_and_b32_e32 v68, 0xffff0000, v5
+; GFX10-NEXT:    v_and_b32_e32 v33, 0xffff0000, v30
+; GFX10-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
+; GFX10-NEXT:    v_and_b32_e32 v35, 0xffff0000, v29
+; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v13
+; GFX10-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
+; GFX10-NEXT:    v_and_b32_e32 v38, 0xffff0000, v12
+; GFX10-NEXT:    v_and_b32_e32 v39, 0xffff0000, v27
+; GFX10-NEXT:    v_and_b32_e32 v48, 0xffff0000, v11
+; GFX10-NEXT:    v_and_b32_e32 v49, 0xffff0000, v26
+; GFX10-NEXT:    v_and_b32_e32 v50, 0xffff0000, v10
+; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v25
+; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v9
+; GFX10-NEXT:    v_add_f32_e32 v53, v54, v53
+; GFX10-NEXT:    v_and_b32_e32 v54, 0xffff0000, v17
+; GFX10-NEXT:    v_add_f32_e32 v55, v64, v55
+; GFX10-NEXT:    v_and_b32_e32 v64, 0xffff0000, v1
+; GFX10-NEXT:    v_add_f32_e32 v65, v66, v65
+; GFX10-NEXT:    v_and_b32_e32 v66, 0xffff0000, v16
+; GFX10-NEXT:    v_add_f32_e32 v67, v68, v67
+; GFX10-NEXT:    v_and_b32_e32 v68, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
+; GFX10-NEXT:    v_add_f32_e32 v33, v34, v33
+; GFX10-NEXT:    v_and_b32_e32 v34, 0xffff0000, v20
+; GFX10-NEXT:    v_add_f32_e32 v35, v36, v35
+; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v4
+; GFX10-NEXT:    v_add_f32_e32 v37, v38, v37
+; GFX10-NEXT:    v_and_b32_e32 v38, 0xffff0000, v19
+; GFX10-NEXT:    v_add_f32_e32 v39, v48, v39
+; GFX10-NEXT:    v_and_b32_e32 v48, 0xffff0000, v3
+; GFX10-NEXT:    v_add_f32_e32 v49, v50, v49
+; GFX10-NEXT:    v_and_b32_e32 v50, 0xffff0000, v18
+; GFX10-NEXT:    v_add_f32_e32 v51, v52, v51
+; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v16
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v17
+; GFX10-NEXT:    v_add_f32_e32 v34, v36, v34
+; GFX10-NEXT:    v_add_f32_e32 v36, v48, v38
+; GFX10-NEXT:    v_add_f32_e32 v38, v52, v50
+; GFX10-NEXT:    v_add_f32_e32 v48, v64, v54
+; GFX10-NEXT:    v_add_f32_e32 v50, v68, v66
+; GFX10-NEXT:    v_add_f32_e32 v14, v14, v30
+; GFX10-NEXT:    v_add_f32_e32 v13, v13, v29
+; GFX10-NEXT:    v_add_f32_e32 v12, v12, v28
+; GFX10-NEXT:    v_add_f32_e32 v11, v11, v27
+; GFX10-NEXT:    v_add_f32_e32 v10, v10, v26
+; GFX10-NEXT:    v_add_f32_e32 v9, v9, v25
+; GFX10-NEXT:    v_add_f32_e32 v8, v8, v24
+; GFX10-NEXT:    v_add_f32_e32 v7, v7, v23
+; GFX10-NEXT:    v_add_f32_e32 v6, v6, v22
+; GFX10-NEXT:    v_add_f32_e32 v5, v5, v21
+; GFX10-NEXT:    v_add_f32_e32 v2, v2, v18
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v19
+; GFX10-NEXT:    v_add_f32_e32 v4, v4, v20
+; GFX10-NEXT:    v_perm_b32 v0, v0, v50, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v1, v1, v48, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v2, v2, v38, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v3, v3, v36, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v4, v4, v34, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v5, v5, v67, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v6, v6, v65, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v7, v7, v55, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v8, v8, v53, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v9, v9, v51, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v10, v10, v49, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v11, v11, v39, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v12, v12, v37, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v13, v13, v35, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v14, v14, v33, 0x3020706
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v31
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v31
+; GFX10-NEXT:    v_add_f32_e32 v16, v32, v16
+; GFX10-NEXT:    v_add_f32_e32 v15, v15, v17
+; GFX10-NEXT:    v_perm_b32 v15, v15, v16, 0x3020706
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fadd_v32bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    v_and_b32_e32 v82, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v84, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v85, 0xffff0000, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v86, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v81, 0xffff0000, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-NEXT:    v_and_b32_e32 v83, 0xffff0000, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT:    v_and_b32_e32 v55, 0xffff0000, v23
+; GFX11-NEXT:    v_and_b32_e32 v70, 0xffff0000, v4
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v18
+; GFX11-NEXT:    v_and_b32_e32 v65, 0xffff0000, v22
+; GFX11-NEXT:    v_dual_add_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v22, 16, v22
+; GFX11-NEXT:    v_and_b32_e32 v66, 0xffff0000, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    v_and_b32_e32 v50, 0xffff0000, v10
+; GFX11-NEXT:    v_and_b32_e32 v69, 0xffff0000, v20
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_add_f32 v6, v6, v22 :: v_dual_lshlrev_b32 v23, 16, v23
+; GFX11-NEXT:    v_and_b32_e32 v54, 0xffff0000, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT:    v_and_b32_e32 v71, 0xffff0000, v19
+; GFX11-NEXT:    v_dual_add_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v19, 16, v19
+; GFX11-NEXT:    v_and_b32_e32 v64, 0xffff0000, v7
+; GFX11-NEXT:    v_and_b32_e32 v49, 0xffff0000, v26
+; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v52, 0xffff0000, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_and_b32_e32 v80, 0xffff0000, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_dual_add_f32 v10, v10, v26 :: v_dual_and_b32 v67, 0xffff0000, v21
+; GFX11-NEXT:    v_and_b32_e32 v68, 0xffff0000, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_add_f32 v3, v3, v19 :: v_dual_and_b32 v38, 0xffff0000, v12
+; GFX11-NEXT:    v_and_b32_e32 v51, 0xffff0000, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT:    v_add_f32_e32 v7, v7, v23
+; GFX11-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
+; GFX11-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v39, 0xffff0000, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-NEXT:    v_add_f32_e32 v9, v9, v25
+; GFX11-NEXT:    v_and_b32_e32 v53, 0xffff0000, v24
+; GFX11-NEXT:    v_and_b32_e32 v48, 0xffff0000, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff0000, v29
+; GFX11-NEXT:    v_and_b32_e32 v36, 0xffff0000, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_add_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v4, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff0000, v30
+; GFX11-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
+; GFX11-NEXT:    v_dual_add_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v15, 16, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    v_dual_add_f32 v33, v34, v33 :: v_dual_add_f32 v34, v36, v35
+; GFX11-NEXT:    v_dual_add_f32 v35, v38, v37 :: v_dual_add_f32 v12, v12, v28
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_add_f32 v8, v8, v24 :: v_dual_add_f32 v5, v5, v21
+; GFX11-NEXT:    v_perm_b32 v12, v12, v35, 0x3020706
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v31
+; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v31
+; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_add_f32 v16, v32, v16 :: v_dual_add_f32 v13, v13, v29
+; GFX11-NEXT:    v_dual_add_f32 v15, v15, v17 :: v_dual_add_f32 v14, v14, v30
+; GFX11-NEXT:    v_add_f32_e32 v36, v48, v39
+; GFX11-NEXT:    v_dual_add_f32 v48, v64, v55 :: v_dual_add_f32 v37, v50, v49
+; GFX11-NEXT:    v_add_f32_e32 v50, v68, v67
+; GFX11-NEXT:    v_dual_add_f32 v38, v52, v51 :: v_dual_add_f32 v51, v70, v69
+; GFX11-NEXT:    v_dual_add_f32 v52, v80, v71 :: v_dual_add_f32 v39, v54, v53
+; GFX11-NEXT:    v_dual_add_f32 v53, v82, v81 :: v_dual_add_f32 v54, v84, v83
+; GFX11-NEXT:    v_add_f32_e32 v55, v86, v85
+; GFX11-NEXT:    v_add_f32_e32 v49, v66, v65
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v3, v3, v52, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v2, v2, v53, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v1, v1, v54, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v0, v0, v55, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v4, v4, v51, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v5, v5, v50, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v6, v6, v49, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v7, v7, v48, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v8, v8, v39, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v9, v9, v38, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v10, v10, v37, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v11, v11, v36, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v13, v13, v34, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v14, v14, v33, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v15, v15, v16, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fadd <32 x bfloat> %a, %b
+  ret <32 x bfloat> %op
+}
+
+define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fsub_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fsub_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fsub_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fsub_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fsub_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fsub_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fsub bfloat %a, %b
+  ret bfloat %op
+}
+
+define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_fsub_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GCN-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fsub_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fsub_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_sub_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fsub_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_sub_f32_e32 v2, v3, v2
+; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fsub_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_sub_f32_e32 v2, v3, v2
+; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fsub_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_sub_f32_e32 v2, v3, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fsub <2 x bfloat> %a, %b
+  ret <2 x bfloat> %op
+}
+
+define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+; GCN-LABEL: v_fsub_v3bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_sub_f32_e32 v2, v2, v5
+; GCN-NEXT:    v_sub_f32_e32 v1, v1, v4
+; GCN-NEXT:    v_sub_f32_e32 v0, v0, v3
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fsub_v3bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v5
+; GFX7-NEXT:    v_sub_f32_e32 v1, v1, v4
+; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fsub_v3bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_sub_f32_e32 v3, v4, v3
+; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fsub_v3bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_sub_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fsub_v3bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_sub_f32_e32 v4, v5, v4
+; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fsub_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_sub_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_sub_f32_e32 v4, v5, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fsub <3 x bfloat> %a, %b
+  ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; GCN-LABEL: v_fsub_v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_sub_f32_e32 v3, v3, v7
+; GCN-NEXT:    v_sub_f32_e32 v2, v2, v6
+; GCN-NEXT:    v_sub_f32_e32 v1, v1, v5
+; GCN-NEXT:    v_sub_f32_e32 v0, v0, v4
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fsub_v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_sub_f32_e32 v3, v3, v7
+; GFX7-NEXT:    v_sub_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_sub_f32_e32 v1, v1, v5
+; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v4
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fsub_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_sub_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_sub_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    v_perm_b32 v1, v1, v4, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fsub_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_sub_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_sub_f32_e32 v3, v5, v3
+; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fsub_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_sub_f32_e32 v3, v5, v4
+; GFX10-NEXT:    v_sub_f32_e32 v4, v7, v6
+; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
+; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fsub_v4bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_sub_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_sub_f32 v3, v7, v6 :: v_dual_sub_f32 v4, v5, v4
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x3020706
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fsub <4 x bfloat> %a, %b
+  ret <4 x bfloat> %op
+}
+
+define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fmul_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmul_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmul_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fmul_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fmul bfloat %a, %b
+  ret bfloat %op
+}
+
+define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmul_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmul_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fmul_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fmul <2 x bfloat> %a, %b
+  ret <2 x bfloat> %op
+}
+
+define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v3bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GCN-NEXT:    v_mul_f32_e32 v1, v1, v4
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v3
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmul_v3bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v4
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v3bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmul_v3bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v3bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fmul_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fmul <3 x bfloat> %a, %b
+  ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_mul_f32_e32 v3, v3, v7
+; GCN-NEXT:    v_mul_f32_e32 v2, v2, v6
+; GCN-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmul_v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v3, v3, v7
+; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    v_perm_b32 v1, v1, v4, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmul_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v3, v5, v3
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_mul_f32_e32 v3, v5, v4
+; GFX10-NEXT:    v_mul_f32_e32 v4, v7, v6
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
+; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fmul_v4bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_mul_f32 v3, v7, v6 :: v_dual_mul_f32 v4, v5, v4
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x3020706
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fmul <4 x bfloat> %a, %b
+  ret <4 x bfloat> %op
+}
+
+define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fdiv_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GCN-NEXT:    v_rcp_f32_e32 v3, v2
+; GCN-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
+; GCN-NEXT:    v_fma_f32 v3, v4, v3, v3
+; GCN-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GCN-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GCN-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; GCN-NEXT:    v_fma_f32 v5, v6, v3, v5
+; GCN-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; GCN-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GCN-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fdiv_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX7-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
+; GFX7-NEXT:    v_fma_f32 v3, v4, v3, v3
+; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX7-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX7-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; GFX7-NEXT:    v_fma_f32 v5, v6, v3, v5
+; GFX7-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; GFX7-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GFX7-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fdiv_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX8-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
+; GFX8-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX8-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
+; GFX8-NEXT:    v_fma_f32 v4, v5, v4, v4
+; GFX8-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GFX8-NEXT:    v_fma_f32 v6, -v2, v5, v3
+; GFX8-NEXT:    v_fma_f32 v5, v6, v4, v5
+; GFX8-NEXT:    v_fma_f32 v2, -v2, v5, v3
+; GFX8-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
+; GFX8-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fdiv_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX9-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
+; GFX9-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX9-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
+; GFX9-NEXT:    v_fma_f32 v4, v5, v4, v4
+; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GFX9-NEXT:    v_fma_f32 v6, -v2, v5, v3
+; GFX9-NEXT:    v_fma_f32 v5, v6, v4, v5
+; GFX9-NEXT:    v_fma_f32 v2, -v2, v5, v3
+; GFX9-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
+; GFX9-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fdiv_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
+; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX10-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v3, v4, v3
+; GFX10-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
+; GFX10-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX10-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; GFX10-NEXT:    v_fmac_f32_e32 v5, v6, v3
+; GFX10-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GFX10-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fdiv_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
+; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e32 v3, v4, v3
+; GFX11-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
+; GFX11-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; GFX11-NEXT:    v_fmac_f32_e32 v5, v6, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; GFX11-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fdiv bfloat %a, %b
+  ret bfloat %op
+}
+
+declare bfloat @llvm.fabs.bf16(bfloat)
+
+define bfloat @v_fabs_bf16(bfloat %a) {
+; GCN-LABEL: v_fabs_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fabs_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fabs_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fabs_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fabs_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fabs_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.fabs.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) {
+; GCN-LABEL: s_fabs_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_bfe_u32 s0, s0, 0xf0010
+; GCN-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: s_fabs_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_bfe_u32 s0, s0, 0xf0010
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fabs_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0xf0010
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fabs_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_bfe_u32 s0, s0, 0xf0010
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fabs_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_bfe_u32 s0, s0, 0xf0010
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_fabs_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_bfe_u32 s0, s0, 0xf0010
+; GFX11-NEXT:    ; return to shader part epilog
+  %op = call bfloat @llvm.fabs.bf16(bfloat %a)
+  %cast = bitcast bfloat %op to i16
+  %zext = zext i16 %cast to i32
+  ret i32 %zext
+}
+
+define bfloat @v_fneg_bf16(bfloat %a) {
+; GCN-LABEL: v_fneg_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fneg_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fneg_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fneg_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fneg_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fneg_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fneg bfloat %a
+  ret bfloat %op
+}
+
+declare i32 @llvm.amdgcn.readfirstlane(i32)
+
+; FIXME: readfirstlane hack for other bugs
+define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) {
+; GCN-LABEL: s_fneg_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_lshr_b32 s0, s0, 16
+; GCN-NEXT:    s_xor_b32 s0, s0, 0x8000
+; GCN-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: s_fneg_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX7-NEXT:    s_xor_b32 s0, s0, 0x8000
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fneg_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0xffff8000
+; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fneg_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff8000
+; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fneg_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX10-NEXT:    v_xor_b32_e64 v0, 0xffff8000, s0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_fneg_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_xor_b32_e64 v0, 0xffff8000, s0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    ; return to shader part epilog
+  %op = fneg bfloat %a
+  %cast = bitcast bfloat %op to i16
+  %zext = zext i16 %cast to i32
+  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+  ret i32 %readlane
+}
+
+define bfloat @v_fneg_fabs_bf16(bfloat %a) {
+; GCN-LABEL: v_fneg_fabs_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fneg_fabs_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fneg_fabs_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fneg_fabs_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fneg_fabs_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fneg_fabs_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
+  %op = fneg bfloat %fabs
+  ret bfloat %op
+}
+
+; FIXME: readfirstlane hack for other bugs
+define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
+; GCN-LABEL: s_fneg_fabs_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_lshr_b32 s0, s0, 16
+; GCN-NEXT:    s_bitset1_b32 s0, 15
+; GCN-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: s_fneg_fabs_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX7-NEXT:    s_bitset1_b32 s0, 15
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_fneg_fabs_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0xffff8000
+; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fneg_fabs_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff8000
+; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_fneg_fabs_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX10-NEXT:    v_or_b32_e64 v0, 0xffff8000, s0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_fneg_fabs_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e64 v0, 0xffff8000, s0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    ; return to shader part epilog
+  %fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
+  %op = fneg bfloat %fabs
+  %cast = bitcast bfloat %op to i16
+  %zext = zext i16 %cast to i32
+  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+  ret i32 %readlane
+}
+
+declare bfloat @llvm.minnum.bf16(bfloat, bfloat)
+declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
+
+define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_minnum_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minnum_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.minnum.bf16(bfloat %a, bfloat %b)
+  ret bfloat %op
+}
+
+define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_min_f32_e32 v1, v1, v3
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v2
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT:    v_min_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT:    v_min_f32_e32 v2, v3, v2
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT:    v_min_f32_e32 v2, v3, v2
+; GFX10-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minnum_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2
+; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_min_f32_e32 v2, v3, v2
+; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
+  ret <2 x bfloat> %op
+}
+
+declare bfloat @llvm.maxnum.bf16(bfloat, bfloat)
+declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
+
+define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_maxnum_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_maxnum_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maxnum_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maxnum_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maxnum_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b)
+  ret bfloat %op
+}
+
+define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_max_f32_e32 v1, v1, v3
+; GCN-NEXT:    v_max_f32_e32 v0, v0, v2
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_maxnum_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maxnum_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT:    v_max_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maxnum_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT:    v_max_f32_e32 v2, v3, v2
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT:    v_max_f32_e32 v2, v3, v2
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maxnum_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2
+; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_max_f32_e32 v2, v3, v2
+; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
+  ret <2 x bfloat> %op
+}
+
+declare bfloat @llvm.sqrt.bf16(bfloat)
+
+define bfloat @v_sqrt_bf16(bfloat %a) {
+; GCN-LABEL: v_sqrt_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_mov_b32 s4, 0xf800000
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x260
+; GCN-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
+; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_sqrt_f32_e32 v2, v0
+; GCN-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
+; GCN-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
+; GCN-NEXT:    v_fma_f32 v5, -v3, v2, v0
+; GCN-NEXT:    v_fma_f32 v6, -v4, v2, v0
+; GCN-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[4:5]
+; GCN-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v6
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[4:5]
+; GCN-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GCN-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_sqrt_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_mov_b32 s4, 0xf800000
+; GFX7-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
+; GFX7-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX7-NEXT:    v_sqrt_f32_e32 v1, v0
+; GFX7-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v1
+; GFX7-NEXT:    v_fma_f32 v3, -v2, v1, v0
+; GFX7-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
+; GFX7-NEXT:    v_add_i32_e64 v3, s[4:5], 1, v1
+; GFX7-NEXT:    v_fma_f32 v1, -v3, v1, v0
+; GFX7-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
+; GFX7-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0x260
+; GFX7-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_sqrt_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_mov_b32 s4, 0xf800000
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    v_sqrt_f32_e32 v1, v0
+; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], -1, v1
+; GFX8-NEXT:    v_fma_f32 v3, -v2, v1, v0
+; GFX8-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
+; GFX8-NEXT:    v_add_u32_e64 v3, s[4:5], 1, v1
+; GFX8-NEXT:    v_fma_f32 v1, -v3, v1, v0
+; GFX8-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
+; GFX8-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x260
+; GFX8-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_sqrt_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0xf800000
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_sqrt_f32_e32 v1, v0
+; GFX9-NEXT:    v_add_u32_e32 v2, -1, v1
+; GFX9-NEXT:    v_fma_f32 v3, -v2, v1, v0
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
+; GFX9-NEXT:    v_add_u32_e32 v3, 1, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
+; GFX9-NEXT:    v_fma_f32 v1, -v3, v1, v0
+; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x260
+; GFX9-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sqrt_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT:    v_sqrt_f32_e32 v1, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, -1, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, 1, v1
+; GFX10-NEXT:    v_fma_f32 v4, -v2, v1, v0
+; GFX10-NEXT:    v_fma_f32 v5, -v3, v1, v0
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, 0, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s4
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, 0, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s4
+; GFX10-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 0x260
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sqrt_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sqrt_f32_e32 v1, v0
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, -1, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 1, v1
+; GFX11-NEXT:    v_fma_f32 v4, -v2, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_fma_f32 v5, -v3, v1, v0
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, 0, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s0
+; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, 0, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s0
+; GFX11-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 0x260
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.sqrt.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+declare bfloat @llvm.ldexp.bf16.i32(bfloat, i32)
+
+define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
+; GCN-LABEL: v_ldexp_bf16_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_ldexp_bf16_i32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_ldexp_bf16_i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_ldexp_bf16_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_ldexp_bf16_i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_ldexp_bf16_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.ldexp.bf16.i32(bfloat %a, i32 %b)
+  ret bfloat %op
+}
+
+declare { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat)
+
+define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
+; GCN-LABEL: v_frexp_bf16_i16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_mov_b32 s4, 0x7f800000
+; GCN-NEXT:    v_frexp_mant_f32_e32 v1, v0
+; GCN-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
+; GCN-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_frexp_bf16_i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
+; GFX7-NEXT:    v_frexp_mant_f32_e32 v0, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_frexp_bf16_i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
+; GFX8-NEXT:    v_frexp_mant_f32_e32 v0, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_frexp_bf16_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
+; GFX9-NEXT:    v_frexp_mant_f32_e32 v0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_frexp_bf16_i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v0, v1
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %op = call { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat %a)
+  ret { bfloat, i16 } %op
+}
+
+
+declare bfloat @llvm.log.bf16(bfloat)
+declare bfloat @llvm.log2.bf16(bfloat)
+declare bfloat @llvm.log10.bf16(bfloat)
+
+define bfloat @v_log_bf16(bfloat %a) {
+; GCN-LABEL: v_log_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_mov_b32 s4, 0x800000
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; GCN-NEXT:    s_mov_b32 s5, 0x7f800000
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x41b17218
+; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_log_f32_e32 v0, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
+; GCN-NEXT:    v_sub_f32_e32 v3, v0, v1
+; GCN-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v1
+; GCN-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
+; GCN-NEXT:    v_mul_f32_e32 v5, 0x3f317000, v3
+; GCN-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v3
+; GCN-NEXT:    v_add_f32_e32 v3, v4, v3
+; GCN-NEXT:    v_add_f32_e32 v3, v5, v3
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v3
+; GCN-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s5
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GCN-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_log_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_mov_b32 s4, 0x800000
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; GFX7-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_log_f32_e32 v0, v0
+; GFX7-NEXT:    s_mov_b32 s4, 0x3f317217
+; GFX7-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
+; GFX7-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; GFX7-NEXT:    s_mov_b32 s4, 0x3377d1cf
+; GFX7-NEXT:    v_fma_f32 v2, v0, s4, v2
+; GFX7-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX7-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0x41b17218
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_log_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_mov_b32 s4, 0x800000
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_log_f32_e32 v0, v0
+; GFX8-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
+; GFX8-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX8-NEXT:    v_mul_f32_e32 v3, 0x3f317000, v2
+; GFX8-NEXT:    v_mul_f32_e32 v2, 0x3805fdf4, v2
+; GFX8-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v1
+; GFX8-NEXT:    v_add_f32_e32 v2, v4, v2
+; GFX8-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
+; GFX8-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX8-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x41b17218
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_log_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0x800000
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_log_f32_e32 v0, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0x3f317217
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
+; GFX9-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x3377d1cf
+; GFX9-NEXT:    v_fma_f32 v2, v0, s4, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41b17218
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_log_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_log_f32_e32 v0, v0
+; GFX10-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, 0x7f800000, |v0|
+; GFX10-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
+; GFX10-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
+; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_log_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_log_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
+; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
+; GFX11-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
+; GFX11-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
+; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.log.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+define bfloat @v_log2_bf16(bfloat %a) {
+; GCN-LABEL: v_log2_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_mov_b32 s4, 0x800000
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_log_f32_e32 v0, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GCN-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_log2_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_mov_b32 s4, 0x800000
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; GFX7-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_log_f32_e32 v0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_log2_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_mov_b32 s4, 0x800000
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_log_f32_e32 v0, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_log2_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0x800000
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_log_f32_e32 v0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_log2_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX10-NEXT:    v_log_f32_e32 v0, v0
+; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_log2_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_log_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.log2.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+define bfloat @v_log10_bf16(bfloat %a) {
+; GCN-LABEL: v_log10_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_mov_b32 s4, 0x800000
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; GCN-NEXT:    s_mov_b32 s5, 0x7f800000
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x411a209b
+; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_log_f32_e32 v0, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
+; GCN-NEXT:    v_sub_f32_e32 v3, v0, v1
+; GCN-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v1
+; GCN-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
+; GCN-NEXT:    v_mul_f32_e32 v5, 0x3e9a2000, v3
+; GCN-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v3
+; GCN-NEXT:    v_add_f32_e32 v3, v4, v3
+; GCN-NEXT:    v_add_f32_e32 v3, v5, v3
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v3
+; GCN-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s5
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GCN-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_log10_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_mov_b32 s4, 0x800000
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; GFX7-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_log_f32_e32 v0, v0
+; GFX7-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; GFX7-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
+; GFX7-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; GFX7-NEXT:    s_mov_b32 s4, 0x3284fbcf
+; GFX7-NEXT:    v_fma_f32 v2, v0, s4, v2
+; GFX7-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX7-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0x411a209b
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_log10_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_mov_b32 s4, 0x800000
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_log_f32_e32 v0, v0
+; GFX8-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
+; GFX8-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX8-NEXT:    v_mul_f32_e32 v3, 0x3e9a2000, v2
+; GFX8-NEXT:    v_mul_f32_e32 v2, 0x369a84fb, v2
+; GFX8-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v1
+; GFX8-NEXT:    v_add_f32_e32 v2, v4, v2
+; GFX8-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
+; GFX8-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX8-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x411a209b
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_log10_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0x800000
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_log_f32_e32 v0, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
+; GFX9-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x3284fbcf
+; GFX9-NEXT:    v_fma_f32 v2, v0, s4, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x411a209b
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_log10_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_log_f32_e32 v0, v0
+; GFX10-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, 0x7f800000, |v0|
+; GFX10-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
+; GFX10-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
+; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_log10_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_log_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
+; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
+; GFX11-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
+; GFX11-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
+; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.log10.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+declare bfloat @llvm.exp.bf16(bfloat)
+declare bfloat @llvm.exp2.bf16(bfloat)
+declare bfloat @llvm.exp10.bf16(bfloat)
+
+define bfloat @v_exp_bf16(bfloat %a) {
+; GCN-LABEL: v_exp_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_mov_b32 s4, 0xc2ce8ed0
+; GCN-NEXT:    s_mov_b32 s5, 0x42b17218
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x7f800000
+; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8a000, v0
+; GCN-NEXT:    v_sub_f32_e32 v3, v0, v0
+; GCN-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v0
+; GCN-NEXT:    v_rndne_f32_e32 v5, v2
+; GCN-NEXT:    v_mul_f32_e32 v6, 0x39a3b295, v3
+; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8a000, v3
+; GCN-NEXT:    v_sub_f32_e32 v2, v2, v5
+; GCN-NEXT:    v_add_f32_e32 v3, v3, v6
+; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
+; GCN-NEXT:    v_add_f32_e32 v3, v4, v3
+; GCN-NEXT:    v_add_f32_e32 v2, v2, v3
+; GCN-NEXT:    v_exp_f32_e32 v2, v2
+; GCN-NEXT:    v_ldexp_f32_e32 v2, v2, v5
+; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, s5, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_exp_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
+; GFX7-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
+; GFX7-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; GFX7-NEXT:    s_mov_b32 s4, 0x32a5705f
+; GFX7-NEXT:    v_rndne_f32_e32 v3, v1
+; GFX7-NEXT:    v_fma_f32 v2, v0, s4, v2
+; GFX7-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX7-NEXT:    v_exp_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_i32_f32_e32 v2, v3
+; GFX7-NEXT:    s_mov_b32 s4, 0xc2ce8ed0
+; GFX7-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
+; GFX7-NEXT:    s_mov_b32 s4, 0x42b17218
+; GFX7-NEXT:    v_ldexp_f32_e32 v1, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; GFX7-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_exp_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_sub_f32_e32 v3, v0, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x3fb8a000, v0
+; GFX8-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v3
+; GFX8-NEXT:    v_mul_f32_e32 v3, 0x3fb8a000, v3
+; GFX8-NEXT:    v_rndne_f32_e32 v2, v1
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX8-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v0
+; GFX8-NEXT:    v_sub_f32_e32 v1, v1, v2
+; GFX8-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_exp_f32_e32 v1, v1
+; GFX8-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX8-NEXT:    s_mov_b32 s4, 0xc2ce8ed0
+; GFX8-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
+; GFX8-NEXT:    s_mov_b32 s4, 0x42b17218
+; GFX8-NEXT:    v_ldexp_f32 v1, v1, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; GFX8-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_exp_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
+; GFX9-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x32a5705f
+; GFX9-NEXT:    v_rndne_f32_e32 v3, v1
+; GFX9-NEXT:    v_fma_f32 v2, v0, s4, v2
+; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX9-NEXT:    v_exp_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v3
+; GFX9-NEXT:    s_mov_b32 s4, 0xc2ce8ed0
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0x42b17218
+; GFX9-NEXT:    v_ldexp_f32 v1, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; GFX9-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_exp_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
+; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v0
+; GFX10-NEXT:    v_fma_f32 v2, 0x3fb8aa3b, v0, -v1
+; GFX10-NEXT:    v_rndne_f32_e32 v3, v1
+; GFX10-NEXT:    v_fmamk_f32 v2, v0, 0x32a5705f, v2
+; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v2, v3
+; GFX10-NEXT:    v_exp_f32_e32 v1, v1
+; GFX10-NEXT:    v_ldexp_f32 v1, v1, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_exp_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
+; GFX11-NEXT:    v_fma_f32 v2, 0x3fb8aa3b, v0, -v1
+; GFX11-NEXT:    v_rndne_f32_e32 v3, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v0
+; GFX11-NEXT:    v_fmamk_f32 v2, v0, 0x32a5705f, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v2, v3
+; GFX11-NEXT:    v_exp_f32_e32 v1, v1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_ldexp_f32 v1, v1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.exp.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+define bfloat @v_exp2_bf16(bfloat %a) {
+; GCN-LABEL: v_exp2_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_mov_b32 s4, 0xc2fc0000
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x42800000
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x1f800000
+; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_exp_f32_e32 v0, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_exp2_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_mov_b32 s4, 0xc2fc0000
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0x42800000
+; GFX7-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_exp_f32_e32 v0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0x1f800000
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_exp2_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_mov_b32 s4, 0xc2fc0000
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x42800000
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_exp_f32_e32 v0, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_exp2_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0xc2fc0000
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x42800000
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_exp_f32_e32 v0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_exp2_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT:    v_exp_f32_e32 v0, v0
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_exp2_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_exp_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.exp2.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+define bfloat @v_exp10_bf16(bfloat %a) {
+; GCN-LABEL: v_exp10_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_mov_b32 s4, 0xc23369f4
+; GCN-NEXT:    s_mov_b32 s5, 0x421a209b
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x7f800000
+; GCN-NEXT:    v_mul_f32_e32 v2, 0x40549000, v0
+; GCN-NEXT:    v_sub_f32_e32 v3, v0, v0
+; GCN-NEXT:    v_mul_f32_e32 v4, 0x3a2784bc, v0
+; GCN-NEXT:    v_rndne_f32_e32 v5, v2
+; GCN-NEXT:    v_mul_f32_e32 v6, 0x3a2784bc, v3
+; GCN-NEXT:    v_mul_f32_e32 v3, 0x40549000, v3
+; GCN-NEXT:    v_sub_f32_e32 v2, v2, v5
+; GCN-NEXT:    v_add_f32_e32 v3, v3, v6
+; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
+; GCN-NEXT:    v_add_f32_e32 v3, v4, v3
+; GCN-NEXT:    v_add_f32_e32 v2, v2, v3
+; GCN-NEXT:    v_exp_f32_e32 v2, v2
+; GCN-NEXT:    v_ldexp_f32_e32 v2, v2, v5
+; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, s5, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_exp10_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_mov_b32 s4, 0x40549a78
+; GFX7-NEXT:    v_mul_f32_e32 v1, 0x40549a78, v0
+; GFX7-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; GFX7-NEXT:    s_mov_b32 s4, 0x33979a37
+; GFX7-NEXT:    v_rndne_f32_e32 v3, v1
+; GFX7-NEXT:    v_fma_f32 v2, v0, s4, v2
+; GFX7-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX7-NEXT:    v_exp_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_i32_f32_e32 v2, v3
+; GFX7-NEXT:    s_mov_b32 s4, 0xc23369f4
+; GFX7-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
+; GFX7-NEXT:    s_mov_b32 s4, 0x421a209b
+; GFX7-NEXT:    v_ldexp_f32_e32 v1, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; GFX7-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_exp10_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_sub_f32_e32 v3, v0, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x40549000, v0
+; GFX8-NEXT:    v_mul_f32_e32 v4, 0x3a2784bc, v3
+; GFX8-NEXT:    v_mul_f32_e32 v3, 0x40549000, v3
+; GFX8-NEXT:    v_rndne_f32_e32 v2, v1
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX8-NEXT:    v_mul_f32_e32 v4, 0x3a2784bc, v0
+; GFX8-NEXT:    v_sub_f32_e32 v1, v1, v2
+; GFX8-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_exp_f32_e32 v1, v1
+; GFX8-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX8-NEXT:    s_mov_b32 s4, 0xc23369f4
+; GFX8-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
+; GFX8-NEXT:    s_mov_b32 s4, 0x421a209b
+; GFX8-NEXT:    v_ldexp_f32 v1, v1, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; GFX8-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_exp10_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0x40549a78
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x40549a78, v0
+; GFX9-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x33979a37
+; GFX9-NEXT:    v_rndne_f32_e32 v3, v1
+; GFX9-NEXT:    v_fma_f32 v2, v0, s4, v2
+; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX9-NEXT:    v_exp_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v3
+; GFX9-NEXT:    s_mov_b32 s4, 0xc23369f4
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0x421a209b
+; GFX9-NEXT:    v_ldexp_f32 v1, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; GFX9-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_exp10_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_mul_f32_e32 v1, 0x40549a78, v0
+; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v0
+; GFX10-NEXT:    v_fma_f32 v2, 0x40549a78, v0, -v1
+; GFX10-NEXT:    v_rndne_f32_e32 v3, v1
+; GFX10-NEXT:    v_fmamk_f32 v2, v0, 0x33979a37, v2
+; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v2, v3
+; GFX10-NEXT:    v_exp_f32_e32 v1, v1
+; GFX10-NEXT:    v_ldexp_f32 v1, v1, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_exp10_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, 0x40549a78, v0
+; GFX11-NEXT:    v_fma_f32 v2, 0x40549a78, v0, -v1
+; GFX11-NEXT:    v_rndne_f32_e32 v3, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v0
+; GFX11-NEXT:    v_fmamk_f32 v2, v0, 0x33979a37, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v2, v3
+; GFX11-NEXT:    v_exp_f32_e32 v1, v1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_ldexp_f32 v1, v1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.exp10.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+declare bfloat @llvm.ceil.bf16(bfloat)
+
+define bfloat @v_ceil_bf16(bfloat %a) {
+; GCN-LABEL: v_ceil_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_ceil_f32_e32 v0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_ceil_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_ceil_f32_e32 v0, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_ceil_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_ceil_f32_e32 v0, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_ceil_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_ceil_f32_e32 v0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_ceil_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_ceil_f32_e32 v0, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_ceil_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_ceil_f32_e32 v0, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.ceil.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+declare bfloat @llvm.trunc.bf16(bfloat)
+
+define bfloat @v_trunc_bf16(bfloat %a) {
+; GCN-LABEL: v_trunc_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_trunc_f32_e32 v0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_trunc_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_trunc_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_trunc_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_trunc_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_trunc_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.trunc.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+declare bfloat @llvm.rint.bf16(bfloat)
+
+define bfloat @v_rint_bf16(bfloat %a) {
+; GCN-LABEL: v_rint_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_rndne_f32_e32 v0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_rint_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_rint_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_rint_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_rint_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_rint_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.rint.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+declare bfloat @llvm.nearbyint.bf16(bfloat)
+
+define bfloat @v_nearbyint_bf16(bfloat %a) {
+; GCN-LABEL: v_nearbyint_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_rndne_f32_e32 v0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_nearbyint_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_nearbyint_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_nearbyint_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_nearbyint_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_nearbyint_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.nearbyint.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+declare bfloat @llvm.round.bf16(bfloat)
+
+define bfloat @v_round_bf16(bfloat %a) {
+; GCN-LABEL: v_round_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_trunc_f32_e32 v1, v0
+; GCN-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GCN-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GCN-NEXT:    s_brev_b32 s4, -2
+; GCN-NEXT:    v_bfi_b32 v0, s4, v2, v0
+; GCN-NEXT:    v_add_f32_e32 v0, v1, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_round_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX7-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX7-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX7-NEXT:    s_brev_b32 s4, -2
+; GFX7-NEXT:    v_bfi_b32 v0, s4, v2, v0
+; GFX7-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_round_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX8-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX8-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX8-NEXT:    s_brev_b32 s4, -2
+; GFX8-NEXT:    v_bfi_b32 v0, s4, v2, v0
+; GFX8-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_round_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v2, v0
+; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_round_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
+; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_round_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
+; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.round.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+declare bfloat @llvm.roundeven.bf16(bfloat)
+
+define bfloat @v_roundeven_bf16(bfloat %a) {
+; GCN-LABEL: v_roundeven_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_rndne_f32_e32 v0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_roundeven_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_roundeven_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.roundeven.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+declare bfloat @llvm.floor.bf16(bfloat)
+
+define bfloat @v_floor_bf16(bfloat %a) {
+; GCN-LABEL: v_floor_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_floor_f32_e32 v0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_floor_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_floor_f32_e32 v0, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_floor_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_floor_f32_e32 v0, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_floor_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_floor_f32_e32 v0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_floor_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_floor_f32_e32 v0, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_floor_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_floor_f32_e32 v0, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.floor.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+declare bfloat @llvm.canonicalize.bf16(bfloat)
+
+define bfloat @v_canonicalize_bf16(bfloat %a) {
+; GCN-LABEL: v_canonicalize_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_canonicalize_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_canonicalize_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_canonicalize_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_canonicalize_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_canonicalize_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.canonicalize.bf16(bfloat %a)
+  ret bfloat %op
+}
+
+declare bfloat @llvm.arithmetic.fence.bf16(bfloat)
+
+; FIXME: Promotion broken
+; define bfloat @v_arithmetic_fence_bf16(bfloat %a) {
+;   %op = call bfloat @llvm.arithmetic.fence.bf16(bfloat %a)
+;   ret bfloat %op
+; }
+
+define i1 @v_fcmp_false_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_false_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_false_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_false_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_false_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_false_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fcmp_false_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp false bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_oeq_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_cmp_eq_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_oeq_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_cmp_eq_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_oeq_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_oeq_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_oeq_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fcmp_oeq_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp oeq bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_ogt_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_ogt_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_ogt_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_ogt_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_ogt_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fcmp_ogt_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp ogt bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_oge_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_oge_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_oge_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_oge_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_oge_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fcmp_oge_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp oge bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_olt_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_olt_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_olt_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_olt_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_olt_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fcmp_olt_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp olt bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_ole_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_cmp_le_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_ole_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_cmp_le_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_ole_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_cmp_le_f32_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_ole_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_cmp_le_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_ole_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fcmp_ole_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp ole bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_one_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_cmp_lg_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_one_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_cmp_lg_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_one_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_cmp_lg_f32_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_one_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_cmp_lg_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_one_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_cmp_lg_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fcmp_one_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_lg_f32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp one bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_uno_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_uno_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_cmp_u_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_uno_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_uno_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_uno_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fcmp_uno_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp uno bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_ueq_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_cmp_nlg_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_ueq_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_cmp_nlg_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_ueq_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_cmp_nlg_f32_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_ueq_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_cmp_nlg_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_ueq_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_cmp_nlg_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fcmp_ueq_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_nlg_f32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp ueq bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_ugt_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_cmp_nle_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_ugt_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_cmp_nle_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_ugt_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_cmp_nle_f32_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_ugt_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_cmp_nle_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_ugt_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_cmp_nle_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fcmp_ugt_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_nle_f32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp ugt bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_uge_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_uge_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_uge_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_uge_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_uge_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fcmp_uge_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp uge bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_ult_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_ult_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_ult_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_ult_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_ult_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_cmp_nge_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fcmp_ult_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_nge_f32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp ult bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_ule_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_ule_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_ule_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_ule_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_ule_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fcmp_ule_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp ule bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_une_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_cmp_neq_f32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_une_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_cmp_neq_f32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_une_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_cmp_neq_f32_e32 vcc, v0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_une_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_cmp_neq_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_une_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_cmp_neq_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fcmp_une_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_neq_f32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp une bfloat %a, %b
+  ret i1 %op
+}
+
+define i1 @v_fcmp_true_bf16(bfloat %a, bfloat %b) {
+; GCN-LABEL: v_fcmp_true_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, 1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fcmp_true_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, 1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fcmp_true_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, 1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fcmp_true_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fcmp_true_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, 1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fcmp_true_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, 1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fcmp true bfloat %a, %b
+  ret i1 %op
+}
+
+declare bfloat @llvm.copysign.bf16(bfloat, bfloat)
+
+define bfloat @v_copysign_bf16_bf16(bfloat %mag, bfloat %sign) {
+; GCN-LABEL: v_copysign_bf16_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_bf16_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_bf16_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_bf16_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
+; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_bf16_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
+; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_copysign_bf16_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+  ret bfloat %op
+}
+
+define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) {
+; GCN-LABEL: v_copysign_bf16_s_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s4, s4, 0x80000000
+; GCN-NEXT:    s_lshr_b32 s4, s4, 16
+; GCN-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GCN-NEXT:    v_or_b32_e32 v0, s4, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_bf16_s_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_and_b32 s4, s4, 0x80000000
+; GFX7-NEXT:    s_lshr_b32 s4, s4, 16
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7-NEXT:    v_or_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_bf16_s_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_and_b32 s4, s4, 0x80000000
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8-NEXT:    s_lshr_b32 s4, s4, 16
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_bf16_s_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_movk_i32 s5, 0x7fff
+; GFX9-NEXT:    s_and_b32 s4, s4, 0x80000000
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
+; GFX9-NEXT:    v_or_b32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_bf16_s_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10-NEXT:    s_and_b32 s4, s4, 0x80000000
+; GFX10-NEXT:    s_lshr_b32 s4, s4, 16
+; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_copysign_bf16_s_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_and_b32 s0, s0, 0x80000000
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+  ret bfloat %op
+}
+
+define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) {
+; GCN-LABEL: v_copysign_s_bf16_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    s_bfe_u32 s4, s4, 0xf0010
+; GCN-NEXT:    v_or_b32_e32 v0, s4, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_s_bf16_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_bfe_u32 s4, s4, 0xf0010
+; GFX7-NEXT:    v_or_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_s_bf16_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_lshr_b32 s4, s4, 16
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_s_bf16_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX9-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
+; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_s_bf16_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_lshr_b32 s4, s4, 16
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
+; GFX10-NEXT:    v_and_b32_e64 v1, 0x7fff, s4
+; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_copysign_s_bf16_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
+; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e64 v1, 0x7fff, s0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+  ret bfloat %op
+}
+
+define bfloat @v_copysign_bf16_f32(bfloat %mag, float %sign.f32) {
+; GCN-LABEL: v_copysign_bf16_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_bf16_f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_bf16_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_bf16_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
+; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_bf16_f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
+; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_copysign_bf16_f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sign = fptrunc float %sign.f32 to bfloat
+  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+  ret bfloat %op
+}
+
+define bfloat @v_copysign_bf16_f64(bfloat %mag, double %sign.f64) {
+; GCN-LABEL: v_copysign_bf16_f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0x80000000, v2
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_bf16_f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0x80000000, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_bf16_f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v1, 0x80000000, v2
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_bf16_f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x80000000, v2
+; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_bf16_f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_e32 v1, 0x80000000, v2
+; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_copysign_bf16_f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0x80000000, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sign = fptrunc double %sign.f64 to bfloat
+  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+  ret bfloat %op
+}
+
+define bfloat @v_copysign_bf16_f16(bfloat %mag, half %sign.f16) {
+; GCN-LABEL: v_copysign_bf16_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_and_b32_e32 v1, 0x8000, v1
+; GCN-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_bf16_f16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7-NEXT:    v_and_b32_e32 v1, 0x8000, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_bf16_f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_bf16_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_bf16_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
+; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_copysign_bf16_f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sign = bitcast half %sign.f16 to bfloat
+  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+  ret bfloat %op
+}
+
+define amdgpu_ps i32 @s_copysign_bf16_bf16(bfloat inreg %mag, bfloat inreg %sign) {
+; GCN-LABEL: s_copysign_bf16_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_and_b32 s1, s1, 0x80000000
+; GCN-NEXT:    s_lshr_b32 s1, s1, 16
+; GCN-NEXT:    s_bfe_u32 s0, s0, 0xf0010
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: s_copysign_bf16_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_and_b32 s1, s1, 0x80000000
+; GFX7-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX7-NEXT:    s_bfe_u32 s0, s0, 0xf0010
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_copysign_bf16_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0x7fff
+; GFX8-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX8-NEXT:    s_and_b32 s0, s1, 0x80000000
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_copysign_bf16_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7fff
+; GFX9-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX9-NEXT:    s_and_b32 s0, s1, 0x80000000
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_copysign_bf16_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX10-NEXT:    v_and_b32_e64 v0, 0x7fff, s0
+; GFX10-NEXT:    s_and_b32 s0, s1, 0x80000000
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_copysign_bf16_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_and_b32_e64 v0, 0x7fff, s0
+; GFX11-NEXT:    s_and_b32 s0, s1, 0x80000000
+; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    ; return to shader part epilog
+  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+  %cast = bitcast bfloat %op to i16
+  %zext = zext i16 %cast to i32
+  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+  ret i32 %readlane
+}
+
+define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f32) {
+; GCN-LABEL: s_copysign_bf16_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_and_b32 s1, s1, 0x80000000
+; GCN-NEXT:    s_lshr_b32 s1, s1, 16
+; GCN-NEXT:    s_bfe_u32 s0, s0, 0xf0010
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: s_copysign_bf16_f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_and_b32 s1, s1, 0x80000000
+; GFX7-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX7-NEXT:    s_bfe_u32 s0, s0, 0xf0010
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_copysign_bf16_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0x7fff
+; GFX8-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX8-NEXT:    s_and_b32 s0, s1, 0x80000000
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_copysign_bf16_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7fff
+; GFX9-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX9-NEXT:    s_and_b32 s0, s1, 0x80000000
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_copysign_bf16_f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX10-NEXT:    v_and_b32_e64 v0, 0x7fff, s0
+; GFX10-NEXT:    s_and_b32 s0, s1, 0x80000000
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_copysign_bf16_f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_and_b32_e64 v0, 0x7fff, s0
+; GFX11-NEXT:    s_and_b32 s0, s1, 0x80000000
+; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    ; return to shader part epilog
+  %sign = fptrunc float %sign.f32 to bfloat
+  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+  %cast = bitcast bfloat %op to i16
+  %zext = zext i16 %cast to i32
+  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+  ret i32 %readlane
+}
+
+define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign.f64) {
+; GCN-LABEL: s_copysign_bf16_f64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_and_b32 s1, s2, 0x80000000
+; GCN-NEXT:    s_lshr_b32 s1, s1, 16
+; GCN-NEXT:    s_bfe_u32 s0, s0, 0xf0010
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: s_copysign_bf16_f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_and_b32 s1, s2, 0x80000000
+; GFX7-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX7-NEXT:    s_bfe_u32 s0, s0, 0xf0010
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_copysign_bf16_f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0x7fff
+; GFX8-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX8-NEXT:    s_and_b32 s0, s2, 0x80000000
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_copysign_bf16_f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7fff
+; GFX9-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX9-NEXT:    s_and_b32 s0, s2, 0x80000000
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_copysign_bf16_f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX10-NEXT:    v_and_b32_e64 v0, 0x7fff, s0
+; GFX10-NEXT:    s_and_b32 s0, s2, 0x80000000
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_copysign_bf16_f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_and_b32_e64 v0, 0x7fff, s0
+; GFX11-NEXT:    s_and_b32 s0, s2, 0x80000000
+; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    ; return to shader part epilog
+  %sign = fptrunc double %sign.f64 to bfloat
+  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+  %cast = bitcast bfloat %op to i16
+  %zext = zext i16 %cast to i32
+  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+  ret i32 %readlane
+}
+
+define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f16) {
+; GCN-LABEL: s_copysign_bf16_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, s1
+; GCN-NEXT:    v_and_b32_e32 v0, 0x8000, v0
+; GCN-NEXT:    s_bfe_u32 s0, s0, 0xf0010
+; GCN-NEXT:    v_or_b32_e32 v0, s0, v0
+; GCN-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: s_copysign_bf16_f16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, s1
+; GFX7-NEXT:    s_bfe_u32 s0, s0, 0xf0010
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x8000, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_copysign_bf16_f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0xffff8000
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8-NEXT:    v_and_b32_e32 v0, s1, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, s0, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_copysign_bf16_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff8000
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX9-NEXT:    v_and_b32_e32 v0, s1, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, s0, v1
+; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_copysign_bf16_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX10-NEXT:    v_and_b32_e64 v0, 0xffff8000, s1
+; GFX10-NEXT:    v_and_b32_e64 v1, 0x7fff, s0
+; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_copysign_bf16_f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-NEXT:    v_and_b32_e64 v0, 0xffff8000, s1
+; GFX11-NEXT:    v_and_b32_e64 v1, 0x7fff, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    ; return to shader part epilog
+  %sign = bitcast half %sign.f16 to bfloat
+  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+  %cast = bitcast bfloat %op to i16
+  %zext = zext i16 %cast to i32
+  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+  ret i32 %readlane
+}
+
+declare float @llvm.copysign.f32(float, float)
+
+define float @v_copysign_f32_bf16(float %mag, bfloat %sign.bf16) {
+; GCN-LABEL: v_copysign_f32_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_brev_b32 s4, -2
+; GCN-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_f32_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_brev_b32 s4, -2
+; GFX7-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_f32_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_brev_b32 s4, -2
+; GFX8-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_f32_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_f32_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_copysign_f32_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sign = fpext bfloat %sign.bf16 to float
+  %op = call float @llvm.copysign.f32(float %mag, float %sign)
+  ret float %op
+}
+
+define amdgpu_ps i32 @s_copysign_f32_bf16(float inreg %mag, bfloat inreg %sign.bf16) {
+; GCN-LABEL: s_copysign_f32_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_brev_b32 s2, -2
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_bfi_b32 v0, s2, v0, v1
+; GCN-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: s_copysign_f32_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_brev_b32 s2, -2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_bfi_b32 v0, s2, v0, v1
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_copysign_f32_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_brev_b32 s2, -2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_bfi_b32 v0, s2, v0, v1
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_copysign_f32_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_brev_b32 s2, -2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_bfi_b32 v0, s2, v0, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_copysign_f32_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_mov_b32_e32 v0, s1
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, s0, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_copysign_f32_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s0, v0
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    ; return to shader part epilog
+  %sign = fpext bfloat %sign.bf16 to float
+  %op = call float @llvm.copysign.f32(float %mag, float %sign)
+  %cast = bitcast float %op to i32
+  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+  ret i32 %readlane
+}
+
+declare half @llvm.copysign.f16(half, half)
+
+define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) {
+; GCN-LABEL: v_copysign_f16_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT:    s_brev_b32 s4, -2
+; GCN-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_f16_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    s_brev_b32 s4, -2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_f16_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_f16_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_f16_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_copysign_f16_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sign = bitcast bfloat %sign.bf16 to half
+  %op = call half @llvm.copysign.f16(half %mag, half %sign)
+  ret half %op
+}
+
+define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf16) {
+; GCN-LABEL: s_copysign_f16_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_lshr_b32 s1, s1, 16
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, s0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, s1
+; GCN-NEXT:    s_brev_b32 s0, -2
+; GCN-NEXT:    v_bfi_b32 v0, s0, v0, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: s_copysign_f16_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, s0
+; GFX7-NEXT:    s_lshr_b32 s0, s1, 16
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, s0
+; GFX7-NEXT:    s_brev_b32 s0, -2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_bfi_b32 v0, s0, v0, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_copysign_f16_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX8-NEXT:    s_movk_i32 s2, 0x7fff
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_bfi_b32 v0, s2, v0, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_copysign_f16_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX9-NEXT:    s_movk_i32 s2, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_bfi_b32 v0, s2, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_copysign_f16_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX10-NEXT:    v_mov_b32_e32 v0, s1
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_copysign_f16_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    ; return to shader part epilog
+  %sign = bitcast bfloat %sign.bf16 to half
+  %op = call half @llvm.copysign.f16(half %mag, half %sign)
+  %cast = bitcast half %op to i16
+  %zext = zext i16 %cast to i32
+  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+  ret i32 %readlane
+}
+
+declare double @llvm.copysign.f64(double, double)
+
+define double @v_copysign_f64_bf16(double %mag, bfloat %sign.bf16) {
+; GCN-LABEL: v_copysign_f64_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_brev_b32 s4, -2
+; GCN-NEXT:    v_bfi_b32 v1, s4, v1, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_f64_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_brev_b32 s4, -2
+; GFX7-NEXT:    v_bfi_b32 v1, s4, v1, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_f64_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_brev_b32 s4, -2
+; GFX8-NEXT:    v_bfi_b32 v1, s4, v1, v2
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_f64_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_f64_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_bfi_b32 v1, 0x7fffffff, v1, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_copysign_f64_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v1, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sign = fpext bfloat %sign.bf16 to double
+  %op = call double @llvm.copysign.f64(double %mag, double %sign)
+  ret double %op
+}
+
+define amdgpu_ps <2 x i32> @s_copysign_f64_bf16(double inreg %mag, bfloat inreg %sign.bf16) {
+; GCN-LABEL: s_copysign_f64_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_brev_b32 s3, -2
+; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    v_mov_b32_e32 v1, s2
+; GCN-NEXT:    v_bfi_b32 v0, s3, v0, v1
+; GCN-NEXT:    v_readfirstlane_b32 s1, v0
+; GCN-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: s_copysign_f64_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_brev_b32 s3, -2
+; GFX7-NEXT:    v_mov_b32_e32 v0, s1
+; GFX7-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7-NEXT:    v_bfi_b32 v0, s3, v0, v1
+; GFX7-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_copysign_f64_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_brev_b32 s3, -2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    v_bfi_b32 v0, s3, v0, v1
+; GFX8-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_copysign_f64_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_brev_b32 s3, -2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    v_bfi_b32 v0, s3, v0, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_copysign_f64_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, s1, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_copysign_f64_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s1, v0
+; GFX11-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX11-NEXT:    ; return to shader part epilog
+  %sign = fpext bfloat %sign.bf16 to double
+  %op = call double @llvm.copysign.f64(double %mag, double %sign)
+  %cast = bitcast double %op to <2 x i32>
+  %cast.0 = extractelement <2 x i32> %cast, i32 0
+  %cast.1 = extractelement <2 x i32> %cast, i32 1
+  %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.0)
+  %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.1)
+  %ins.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
+  %ins.1 = insertelement <2 x i32> %ins.0, i32 %readlane1, i32 1
+  ret <2 x i32> %ins.1
+}
+
+define i16 @v_fptosi_bf16_to_i16(bfloat %x) {
+; GCN-LABEL: v_fptosi_bf16_to_i16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fptosi_bf16_to_i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fptosi_bf16_to_i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fptosi_bf16_to_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fptosi_bf16_to_i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fptosi_bf16_to_i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fptosi bfloat %x to i16
+  ret i16 %op
+}
+
+define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) {
+; GCN-LABEL: v_fptosi_v2bf16_to_v2i16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fptosi_v2bf16_to_v2i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fptosi_v2bf16_to_v2i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_cvt_i32_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fptosi_v2bf16_to_v2i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fptosi_v2bf16_to_v2i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fptosi_v2bf16_to_v2i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fptosi <2 x bfloat> %x to <2 x i16>
+  ret <2 x i16> %op
+}
+
+define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) {
+; GCN-LABEL: v_fptosi_v3bf16_to_v3i16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v3
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    v_alignbit_b32 v1, v3, v1, 16
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fptosi_v3bf16_to_v3i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_cvt_i32_f32_e32 v3, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v3
+; GFX7-NEXT:    v_alignbit_b32 v1, v3, v1, 16
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fptosi_v3bf16_to_v3i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX8-NEXT:    v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fptosi_v3bf16_to_v3i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fptosi_v3bf16_to_v3i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fptosi_v3bf16_to_v3i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fptosi <3 x bfloat> %x to <3 x i16>
+  ret <3 x i16> %op
+}
+
+define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
+; GCN-LABEL: v_fptosi_v4bf16_to_v4i16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    v_or_b32_e32 v2, v2, v4
+; GCN-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fptosi_v4bf16_to_v4i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fptosi_v4bf16_to_v4i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX8-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX8-NEXT:    v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cvt_i32_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fptosi_v4bf16_to_v4i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fptosi_v4bf16_to_v4i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fptosi_v4bf16_to_v4i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fptosi <4 x bfloat> %x to <4 x i16>
+  ret <4 x i16> %op
+}
+
+define i32 @v_fptosi_bf16_to_i32(bfloat %x) {
+; GCN-LABEL: v_fptosi_bf16_to_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fptosi_bf16_to_i32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fptosi_bf16_to_i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fptosi_bf16_to_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fptosi_bf16_to_i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fptosi_bf16_to_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fptosi bfloat %x to i32
+  ret i32 %op
+}
+
+define <2 x i32> @v_fptosi_v2bf16_to_v2i32(<2 x bfloat> %x) {
+; GCN-LABEL: v_fptosi_v2bf16_to_v2i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fptosi_v2bf16_to_v2i32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fptosi_v2bf16_to_v2i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX8-NEXT:    v_cvt_i32_f32_e32 v2, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_cvt_i32_f32_e32 v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, v2
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fptosi_v2bf16_to_v2i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fptosi_v2bf16_to_v2i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v1
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v1, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fptosi_v2bf16_to_v2i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v1
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v1, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fptosi <2 x bfloat> %x to <2 x i32>
+  ret <2 x i32> %op
+}
+
+define <3 x i32> @v_fptosi_v3bf16_to_v3i32(<3 x bfloat> %x) {
+; GCN-LABEL: v_fptosi_v3bf16_to_v3i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fptosi_v3bf16_to_v3i32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fptosi_v3bf16_to_v3i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_cvt_i32_f32_e32 v4, v2
+; GFX8-NEXT:    v_cvt_i32_f32_e32 v3, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX8-NEXT:    v_cvt_i32_f32_e32 v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fptosi_v3bf16_to_v3i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v2
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fptosi_v3bf16_to_v3i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v2
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v1, v3
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v2, v4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fptosi_v3bf16_to_v3i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v2
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v2, v4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fptosi <3 x bfloat> %x to <3 x i32>
+  ret <3 x i32> %op
+}
+
+define <4 x i32> @v_fptosi_v4bf16_to_v4i32(<4 x bfloat> %x) {
+; GCN-LABEL: v_fptosi_v4bf16_to_v4i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fptosi_v4bf16_to_v4i32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fptosi_v4bf16_to_v4i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_cvt_i32_f32_e32 v5, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX8-NEXT:    v_cvt_i32_f32_e32 v4, v2
+; GFX8-NEXT:    v_cvt_i32_f32_e32 v2, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v1
+; GFX8-NEXT:    v_cvt_i32_f32_e32 v3, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fptosi_v4bf16_to_v4i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v2
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v1
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fptosi_v4bf16_to_v4i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v2
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v1, v3
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v2, v4
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v3, v5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fptosi_v4bf16_to_v4i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v2
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v2, v4
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v3, v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fptosi <4 x bfloat> %x to <4 x i32>
+  ret <4 x i32> %op
+}
+
+define i64 @v_fptosi_bf16_to_i64(bfloat %x) {
+; GCN-LABEL: v_fptosi_bf16_to_i64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_mov_b32 s4, 0x2f800000
+; GCN-NEXT:    s_mov_b32 s5, 0xcf800000
+; GCN-NEXT:    v_trunc_f32_e32 v0, v0
+; GCN-NEXT:    v_mul_f32_e64 v1, |v0|, s4
+; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
+; GCN-NEXT:    v_floor_f32_e32 v1, v1
+; GCN-NEXT:    v_fma_f32 v0, v1, s5, |v0|
+; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GCN-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GCN-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fptosi_bf16_to_i64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX7-NEXT:    s_mov_b32 s4, 0x2f800000
+; GFX7-NEXT:    v_mul_f32_e64 v1, |v0|, s4
+; GFX7-NEXT:    v_floor_f32_e32 v1, v1
+; GFX7-NEXT:    s_mov_b32 s4, 0xcf800000
+; GFX7-NEXT:    v_fma_f32 v2, v1, s4, |v0|
+; GFX7-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX7-NEXT:    v_xor_b32_e32 v0, v2, v3
+; GFX7-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
+; GFX7-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fptosi_bf16_to_i64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX8-NEXT:    s_mov_b32 s4, 0x2f800000
+; GFX8-NEXT:    v_mul_f32_e64 v1, |v0|, s4
+; GFX8-NEXT:    v_floor_f32_e32 v1, v1
+; GFX8-NEXT:    s_mov_b32 s4, 0xcf800000
+; GFX8-NEXT:    v_fma_f32 v2, v1, s4, |v0|
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, v2, v3
+; GFX8-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v3
+; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fptosi_bf16_to_i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0x2f800000
+; GFX9-NEXT:    v_mul_f32_e64 v1, |v0|, s4
+; GFX9-NEXT:    v_floor_f32_e32 v1, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0xcf800000
+; GFX9-NEXT:    v_fma_f32 v2, v1, s4, |v0|
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, v2, v3
+; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fptosi_bf16_to_i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX10-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
+; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX10-NEXT:    v_floor_f32_e32 v1, v1
+; GFX10-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v2
+; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fptosi_bf16_to_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX11-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
+; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_floor_f32_e32 v1, v1
+; GFX11-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v2
+; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fptosi bfloat %x to i64
+  ret i64 %op
+}
+
+define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) {
+; GCN-LABEL: v_fptosi_v2bf16_to_v2i64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_mov_b32 s4, 0x2f800000
+; GCN-NEXT:    s_mov_b32 s5, 0xcf800000
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_trunc_f32_e32 v0, v0
+; GCN-NEXT:    v_trunc_f32_e32 v1, v1
+; GCN-NEXT:    v_mul_f32_e64 v2, |v0|, s4
+; GCN-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GCN-NEXT:    v_mul_f32_e64 v4, |v1|, s4
+; GCN-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
+; GCN-NEXT:    v_floor_f32_e32 v2, v2
+; GCN-NEXT:    v_floor_f32_e32 v4, v4
+; GCN-NEXT:    v_fma_f32 v0, v2, s5, |v0|
+; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GCN-NEXT:    v_fma_f32 v1, v4, s5, |v1|
+; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GCN-NEXT:    v_xor_b32_e32 v2, v2, v3
+; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GCN-NEXT:    v_xor_b32_e32 v4, v4, v5
+; GCN-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GCN-NEXT:    v_xor_b32_e32 v6, v1, v5
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v2, v3, vcc
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v6, v5
+; GCN-NEXT:    v_subb_u32_e32 v3, vcc, v4, v5, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fptosi_v2bf16_to_v2i64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX7-NEXT:    s_mov_b32 s4, 0x2f800000
+; GFX7-NEXT:    v_mul_f32_e64 v2, |v0|, s4
+; GFX7-NEXT:    v_floor_f32_e32 v2, v2
+; GFX7-NEXT:    s_mov_b32 s5, 0xcf800000
+; GFX7-NEXT:    v_fma_f32 v3, v2, s5, |v0|
+; GFX7-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX7-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX7-NEXT:    v_xor_b32_e32 v0, v3, v4
+; GFX7-NEXT:    v_trunc_f32_e32 v3, v1
+; GFX7-NEXT:    v_mul_f32_e64 v1, |v3|, s4
+; GFX7-NEXT:    v_floor_f32_e32 v1, v1
+; GFX7-NEXT:    v_fma_f32 v5, v1, s5, |v3|
+; GFX7-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_u32_f32_e32 v6, v1
+; GFX7-NEXT:    v_xor_b32_e32 v2, v2, v4
+; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
+; GFX7-NEXT:    v_subb_u32_e32 v1, vcc, v2, v4, vcc
+; GFX7-NEXT:    v_xor_b32_e32 v2, v5, v3
+; GFX7-NEXT:    v_xor_b32_e32 v4, v6, v3
+; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
+; GFX7-NEXT:    v_subb_u32_e32 v3, vcc, v4, v3, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fptosi_v2bf16_to_v2i64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX8-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX8-NEXT:    s_mov_b32 s4, 0x2f800000
+; GFX8-NEXT:    v_mul_f32_e64 v2, |v1|, s4
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_floor_f32_e32 v2, v2
+; GFX8-NEXT:    s_mov_b32 s5, 0xcf800000
+; GFX8-NEXT:    v_trunc_f32_e32 v4, v0
+; GFX8-NEXT:    v_fma_f32 v3, v2, s5, |v1|
+; GFX8-NEXT:    v_mul_f32_e64 v0, |v4|, s4
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX8-NEXT:    v_floor_f32_e32 v0, v0
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX8-NEXT:    v_fma_f32 v5, v0, s5, |v4|
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v6, v0
+; GFX8-NEXT:    v_xor_b32_e32 v3, v3, v1
+; GFX8-NEXT:    v_xor_b32_e32 v2, v2, v1
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v3, v1
+; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v4
+; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
+; GFX8-NEXT:    v_xor_b32_e32 v2, v5, v3
+; GFX8-NEXT:    v_xor_b32_e32 v4, v6, v3
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
+; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v4, v3, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fptosi_v2bf16_to_v2i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x2f800000
+; GFX9-NEXT:    v_mul_f32_e64 v2, |v1|, s4
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_floor_f32_e32 v2, v2
+; GFX9-NEXT:    s_mov_b32 s5, 0xcf800000
+; GFX9-NEXT:    v_trunc_f32_e32 v4, v0
+; GFX9-NEXT:    v_fma_f32 v3, v2, s5, |v1|
+; GFX9-NEXT:    v_mul_f32_e64 v0, |v4|, s4
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    v_floor_f32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    v_fma_f32 v5, v0, s5, |v4|
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v0
+; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v1
+; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v1
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v3, v1
+; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v4
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v2, v5, v3
+; GFX9-NEXT:    v_xor_b32_e32 v4, v6, v3
+; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v3
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fptosi_v2bf16_to_v2i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX10-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX10-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
+; GFX10-NEXT:    v_mul_f32_e64 v3, 0x2f800000, |v0|
+; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v0
+; GFX10-NEXT:    v_floor_f32_e32 v2, v2
+; GFX10-NEXT:    v_floor_f32_e32 v3, v3
+; GFX10-NEXT:    v_fma_f32 v4, 0xcf800000, v2, |v1|
+; GFX10-NEXT:    v_fma_f32 v5, 0xcf800000, v3, |v0|
+; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v4
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v4, v5
+; GFX10-NEXT:    v_xor_b32_e32 v2, v2, v1
+; GFX10-NEXT:    v_xor_b32_e32 v3, v3, v6
+; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX10-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v1
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v1, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v4, v6
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fptosi_v2bf16_to_v2i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX11-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
+; GFX11-NEXT:    v_mul_f32_e64 v3, 0x2f800000, |v0|
+; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_floor_f32_e32 v2, v2
+; GFX11-NEXT:    v_floor_f32_e32 v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_fma_f32 v4, 0xcf800000, v2, |v1|
+; GFX11-NEXT:    v_fma_f32 v5, 0xcf800000, v3, |v0|
+; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v4
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v4, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_xor_b32_e32 v2, v2, v1
+; GFX11-NEXT:    v_xor_b32_e32 v3, v3, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX11-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v1
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v1, vcc_lo
+; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, v4, v6
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fptosi <2 x bfloat> %x to <2 x i64>
+  ret <2 x i64> %op
+}
+
+define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) {
+; GCN-LABEL: v_fptosi_v3bf16_to_v3i64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_mov_b32 s4, 0x2f800000
+; GCN-NEXT:    s_mov_b32 s5, 0xcf800000
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_trunc_f32_e32 v0, v0
+; GCN-NEXT:    v_trunc_f32_e32 v1, v1
+; GCN-NEXT:    v_trunc_f32_e32 v2, v2
+; GCN-NEXT:    v_mul_f32_e64 v3, |v0|, s4
+; GCN-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
+; GCN-NEXT:    v_mul_f32_e64 v5, |v1|, s4
+; GCN-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
+; GCN-NEXT:    v_mul_f32_e64 v7, |v2|, s4
+; GCN-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
+; GCN-NEXT:    v_floor_f32_e32 v3, v3
+; GCN-NEXT:    v_floor_f32_e32 v5, v5
+; GCN-NEXT:    v_floor_f32_e32 v7, v7
+; GCN-NEXT:    v_fma_f32 v0, v3, s5, |v0|
+; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GCN-NEXT:    v_fma_f32 v1, v5, s5, |v1|
+; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GCN-NEXT:    v_fma_f32 v2, v7, s5, |v2|
+; GCN-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GCN-NEXT:    v_xor_b32_e32 v3, v3, v4
+; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GCN-NEXT:    v_xor_b32_e32 v5, v5, v6
+; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GCN-NEXT:    v_xor_b32_e32 v7, v7, v8
+; GCN-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GCN-NEXT:    v_xor_b32_e32 v9, v1, v6
+; GCN-NEXT:    v_xor_b32_e32 v10, v2, v8
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v3, v4, vcc
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v9, v6
+; GCN-NEXT:    v_subb_u32_e32 v3, vcc, v5, v6, vcc
+; GCN-NEXT:    v_sub_i32_e32 v4, vcc, v10, v8
+; GCN-NEXT:    v_subb_u32_e32 v5, vcc, v7, v8, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fptosi_v3bf16_to_v3i64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX7-NEXT:    s_mov_b32 s4, 0x2f800000
+; GFX7-NEXT:    v_mul_f32_e64 v3, |v0|, s4
+; GFX7-NEXT:    v_floor_f32_e32 v3, v3
+; GFX7-NEXT:    s_mov_b32 s5, 0xcf800000
+; GFX7-NEXT:    v_fma_f32 v4, v3, s5, |v0|
+; GFX7-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GFX7-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX7-NEXT:    v_xor_b32_e32 v0, v4, v5
+; GFX7-NEXT:    v_trunc_f32_e32 v4, v1
+; GFX7-NEXT:    v_mul_f32_e64 v1, |v4|, s4
+; GFX7-NEXT:    v_floor_f32_e32 v1, v1
+; GFX7-NEXT:    v_fma_f32 v6, v1, s5, |v4|
+; GFX7-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX7-NEXT:    v_xor_b32_e32 v3, v3, v5
+; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
+; GFX7-NEXT:    v_cvt_u32_f32_e32 v7, v1
+; GFX7-NEXT:    v_subb_u32_e32 v1, vcc, v3, v5, vcc
+; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 31, v4
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_xor_b32_e32 v5, v6, v3
+; GFX7-NEXT:    v_trunc_f32_e32 v6, v2
+; GFX7-NEXT:    v_mul_f32_e64 v2, |v6|, s4
+; GFX7-NEXT:    v_floor_f32_e32 v2, v2
+; GFX7-NEXT:    v_xor_b32_e32 v4, v7, v3
+; GFX7-NEXT:    v_fma_f32 v7, v2, s5, |v6|
+; GFX7-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GFX7-NEXT:    v_cvt_u32_f32_e32 v8, v2
+; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, v5, v3
+; GFX7-NEXT:    v_ashrrev_i32_e32 v5, 31, v6
+; GFX7-NEXT:    v_subb_u32_e32 v3, vcc, v4, v3, vcc
+; GFX7-NEXT:    v_xor_b32_e32 v4, v7, v5
+; GFX7-NEXT:    v_xor_b32_e32 v6, v8, v5
+; GFX7-NEXT:    v_sub_i32_e32 v4, vcc, v4, v5
+; GFX7-NEXT:    v_subb_u32_e32 v5, vcc, v6, v5, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fptosi_v3bf16_to_v3i64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX8-NEXT:    s_mov_b32 s4, 0x2f800000
+; GFX8-NEXT:    v_mul_f32_e64 v3, |v2|, s4
+; GFX8-NEXT:    v_floor_f32_e32 v3, v3
+; GFX8-NEXT:    s_mov_b32 s5, 0xcf800000
+; GFX8-NEXT:    v_fma_f32 v4, v3, s5, |v2|
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GFX8-NEXT:    v_trunc_f32_e32 v5, v0
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX8-NEXT:    v_mul_f32_e64 v0, |v5|, s4
+; GFX8-NEXT:    v_floor_f32_e32 v0, v0
+; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
+; GFX8-NEXT:    v_fma_f32 v6, v0, s5, |v5|
+; GFX8-NEXT:    v_xor_b32_e32 v4, v4, v2
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v7, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_xor_b32_e32 v3, v3, v2
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v8, v0
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v4, v2
+; GFX8-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX8-NEXT:    v_subb_u32_e32 v6, vcc, v3, v2, vcc
+; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
+; GFX8-NEXT:    v_mul_f32_e64 v5, |v1|, s4
+; GFX8-NEXT:    v_floor_f32_e32 v5, v5
+; GFX8-NEXT:    v_xor_b32_e32 v2, v7, v3
+; GFX8-NEXT:    v_fma_f32 v7, v5, s5, |v1|
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX8-NEXT:    v_xor_b32_e32 v4, v8, v3
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
+; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v4, v3, vcc
+; GFX8-NEXT:    v_xor_b32_e32 v4, v7, v1
+; GFX8-NEXT:    v_xor_b32_e32 v5, v5, v1
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v1
+; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v5, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v1, v6
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fptosi_v3bf16_to_v3i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x2f800000
+; GFX9-NEXT:    v_mul_f32_e64 v3, |v2|, s4
+; GFX9-NEXT:    v_floor_f32_e32 v3, v3
+; GFX9-NEXT:    s_mov_b32 s5, 0xcf800000
+; GFX9-NEXT:    v_fma_f32 v4, v3, s5, |v2|
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GFX9-NEXT:    v_trunc_f32_e32 v5, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    v_mul_f32_e64 v0, |v5|, s4
+; GFX9-NEXT:    v_floor_f32_e32 v0, v0
+; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
+; GFX9-NEXT:    v_fma_f32 v6, v0, s5, |v5|
+; GFX9-NEXT:    v_xor_b32_e32 v4, v4, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v8, v0
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v4, v2
+; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX9-NEXT:    v_subb_co_u32_e32 v6, vcc, v3, v2, vcc
+; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
+; GFX9-NEXT:    v_mul_f32_e64 v5, |v1|, s4
+; GFX9-NEXT:    v_floor_f32_e32 v5, v5
+; GFX9-NEXT:    v_xor_b32_e32 v2, v7, v3
+; GFX9-NEXT:    v_fma_f32 v7, v5, s5, |v1|
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX9-NEXT:    v_xor_b32_e32 v4, v8, v3
+; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v3
+; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v4, v7, v1
+; GFX9-NEXT:    v_xor_b32_e32 v5, v5, v1
+; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v4, v1
+; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, v6
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fptosi_v3bf16_to_v3i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX10-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX10-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX10-NEXT:    v_mul_f32_e64 v3, 0x2f800000, |v2|
+; GFX10-NEXT:    v_mul_f32_e64 v4, 0x2f800000, |v0|
+; GFX10-NEXT:    v_mul_f32_e64 v6, 0x2f800000, |v1|
+; GFX10-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
+; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v0
+; GFX10-NEXT:    v_floor_f32_e32 v3, v3
+; GFX10-NEXT:    v_floor_f32_e32 v4, v4
+; GFX10-NEXT:    v_floor_f32_e32 v6, v6
+; GFX10-NEXT:    v_ashrrev_i32_e32 v8, 31, v1
+; GFX10-NEXT:    v_fma_f32 v2, 0xcf800000, v3, |v2|
+; GFX10-NEXT:    v_fma_f32 v0, 0xcf800000, v4, |v0|
+; GFX10-NEXT:    v_fma_f32 v1, 0xcf800000, v6, |v1|
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX10-NEXT:    v_xor_b32_e32 v3, v3, v5
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX10-NEXT:    v_xor_b32_e32 v2, v2, v5
+; GFX10-NEXT:    v_xor_b32_e32 v9, v0, v7
+; GFX10-NEXT:    v_xor_b32_e32 v4, v4, v7
+; GFX10-NEXT:    v_xor_b32_e32 v10, v1, v8
+; GFX10-NEXT:    v_xor_b32_e32 v6, v6, v8
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v2, v5
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v3, v5, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v9, v7
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v7, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v10, v8
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v6, v8, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fptosi_v3bf16_to_v3i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX11-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX11-NEXT:    v_mul_f32_e64 v3, 0x2f800000, |v2|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_mul_f32_e64 v4, 0x2f800000, |v0|
+; GFX11-NEXT:    v_mul_f32_e64 v6, 0x2f800000, |v1|
+; GFX11-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
+; GFX11-NEXT:    v_ashrrev_i32_e32 v7, 31, v0
+; GFX11-NEXT:    v_floor_f32_e32 v3, v3
+; GFX11-NEXT:    v_floor_f32_e32 v4, v4
+; GFX11-NEXT:    v_floor_f32_e32 v6, v6
+; GFX11-NEXT:    v_ashrrev_i32_e32 v8, 31, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_fma_f32 v2, 0xcf800000, v3, |v2|
+; GFX11-NEXT:    v_fma_f32 v0, 0xcf800000, v4, |v0|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_fma_f32 v1, 0xcf800000, v6, |v1|
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX11-NEXT:    v_xor_b32_e32 v3, v3, v5
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX11-NEXT:    v_xor_b32_e32 v2, v2, v5
+; GFX11-NEXT:    v_xor_b32_e32 v9, v0, v7
+; GFX11-NEXT:    v_xor_b32_e32 v4, v4, v7
+; GFX11-NEXT:    v_xor_b32_e32 v10, v1, v8
+; GFX11-NEXT:    v_xor_b32_e32 v6, v6, v8
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v2, v5
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v3, v5, vcc_lo
+; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, v9, v7
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v7, vcc_lo
+; GFX11-NEXT:    v_sub_co_u32 v4, vcc_lo, v10, v8
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v6, v8, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fptosi <3 x bfloat> %x to <3 x i64>
+  ret <3 x i64> %op
+}
+
+define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) {
+; GCN-LABEL: v_fptosi_v4bf16_to_v4i64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_mov_b32 s4, 0x2f800000
+; GCN-NEXT:    s_mov_b32 s5, 0xcf800000
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_trunc_f32_e32 v0, v0
+; GCN-NEXT:    v_trunc_f32_e32 v1, v1
+; GCN-NEXT:    v_trunc_f32_e32 v2, v2
+; GCN-NEXT:    v_trunc_f32_e32 v3, v3
+; GCN-NEXT:    v_mul_f32_e64 v4, |v0|, s4
+; GCN-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
+; GCN-NEXT:    v_mul_f32_e64 v6, |v1|, s4
+; GCN-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
+; GCN-NEXT:    v_mul_f32_e64 v8, |v2|, s4
+; GCN-NEXT:    v_ashrrev_i32_e32 v9, 31, v2
+; GCN-NEXT:    v_mul_f32_e64 v10, |v3|, s4
+; GCN-NEXT:    v_ashrrev_i32_e32 v11, 31, v3
+; GCN-NEXT:    v_floor_f32_e32 v4, v4
+; GCN-NEXT:    v_floor_f32_e32 v6, v6
+; GCN-NEXT:    v_floor_f32_e32 v8, v8
+; GCN-NEXT:    v_floor_f32_e32 v10, v10
+; GCN-NEXT:    v_fma_f32 v0, v4, s5, |v0|
+; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GCN-NEXT:    v_fma_f32 v1, v6, s5, |v1|
+; GCN-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GCN-NEXT:    v_fma_f32 v2, v8, s5, |v2|
+; GCN-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; GCN-NEXT:    v_fma_f32 v3, v10, s5, |v3|
+; GCN-NEXT:    v_cvt_u32_f32_e32 v10, v10
+; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GCN-NEXT:    v_xor_b32_e32 v4, v4, v5
+; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GCN-NEXT:    v_xor_b32_e32 v6, v6, v7
+; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GCN-NEXT:    v_xor_b32_e32 v8, v8, v9
+; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GCN-NEXT:    v_xor_b32_e32 v10, v10, v11
+; GCN-NEXT:    v_xor_b32_e32 v0, v0, v5
+; GCN-NEXT:    v_xor_b32_e32 v12, v1, v7
+; GCN-NEXT:    v_xor_b32_e32 v13, v2, v9
+; GCN-NEXT:    v_xor_b32_e32 v14, v3, v11
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v4, v5, vcc
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v12, v7
+; GCN-NEXT:    v_subb_u32_e32 v3, vcc, v6, v7, vcc
+; GCN-NEXT:    v_sub_i32_e32 v4, vcc, v13, v9
+; GCN-NEXT:    v_subb_u32_e32 v5, vcc, v8, v9, vcc
+; GCN-NEXT:    v_sub_i32_e32 v6, vcc, v14, v11
+; GCN-NEXT:    v_subb_u32_e32 v7, vcc, v10, v11, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fptosi_v4bf16_to_v4i64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX7-NEXT:    s_mov_b32 s4, 0x2f800000
+; GFX7-NEXT:    v_mul_f32_e64 v4, |v0|, s4
+; GFX7-NEXT:    v_floor_f32_e32 v4, v4
+; GFX7-NEXT:    s_mov_b32 s5, 0xcf800000
+; GFX7-NEXT:    v_fma_f32 v5, v4, s5, |v0|
+; GFX7-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX7-NEXT:    v_ashrrev_i32_e32 v6, 31, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GFX7-NEXT:    v_xor_b32_e32 v0, v5, v6
+; GFX7-NEXT:    v_trunc_f32_e32 v5, v1
+; GFX7-NEXT:    v_mul_f32_e64 v1, |v5|, s4
+; GFX7-NEXT:    v_floor_f32_e32 v1, v1
+; GFX7-NEXT:    v_fma_f32 v7, v1, s5, |v5|
+; GFX7-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GFX7-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
+; GFX7-NEXT:    v_cvt_u32_f32_e32 v8, v1
+; GFX7-NEXT:    v_subb_u32_e32 v1, vcc, v4, v6, vcc
+; GFX7-NEXT:    v_ashrrev_i32_e32 v4, 31, v5
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_xor_b32_e32 v6, v7, v4
+; GFX7-NEXT:    v_trunc_f32_e32 v7, v2
+; GFX7-NEXT:    v_mul_f32_e64 v2, |v7|, s4
+; GFX7-NEXT:    v_floor_f32_e32 v2, v2
+; GFX7-NEXT:    v_xor_b32_e32 v5, v8, v4
+; GFX7-NEXT:    v_fma_f32 v8, v2, s5, |v7|
+; GFX7-NEXT:    v_cvt_u32_f32_e32 v9, v8
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_cvt_u32_f32_e32 v10, v2
+; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, v6, v4
+; GFX7-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX7-NEXT:    v_subb_u32_e32 v8, vcc, v5, v4, vcc
+; GFX7-NEXT:    v_ashrrev_i32_e32 v5, 31, v7
+; GFX7-NEXT:    v_mul_f32_e64 v7, |v3|, s4
+; GFX7-NEXT:    v_floor_f32_e32 v7, v7
+; GFX7-NEXT:    v_xor_b32_e32 v4, v9, v5
+; GFX7-NEXT:    v_fma_f32 v9, v7, s5, |v3|
+; GFX7-NEXT:    v_cvt_u32_f32_e32 v9, v9
+; GFX7-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GFX7-NEXT:    v_xor_b32_e32 v6, v10, v5
+; GFX7-NEXT:    v_sub_i32_e32 v4, vcc, v4, v5
+; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
+; GFX7-NEXT:    v_subb_u32_e32 v5, vcc, v6, v5, vcc
+; GFX7-NEXT:    v_xor_b32_e32 v6, v9, v3
+; GFX7-NEXT:    v_xor_b32_e32 v7, v7, v3
+; GFX7-NEXT:    v_sub_i32_e32 v6, vcc, v6, v3
+; GFX7-NEXT:    v_subb_u32_e32 v7, vcc, v7, v3, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v3, v8
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fptosi_v4bf16_to_v4i64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX8-NEXT:    s_mov_b32 s4, 0x2f800000
+; GFX8-NEXT:    v_mul_f32_e64 v3, |v2|, s4
+; GFX8-NEXT:    v_floor_f32_e32 v3, v3
+; GFX8-NEXT:    s_mov_b32 s5, 0xcf800000
+; GFX8-NEXT:    v_fma_f32 v4, v3, s5, |v2|
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GFX8-NEXT:    v_trunc_f32_e32 v5, v0
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX8-NEXT:    v_mul_f32_e64 v0, |v5|, s4
+; GFX8-NEXT:    v_floor_f32_e32 v0, v0
+; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
+; GFX8-NEXT:    v_fma_f32 v6, v0, s5, |v5|
+; GFX8-NEXT:    v_xor_b32_e32 v4, v4, v2
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX8-NEXT:    v_xor_b32_e32 v3, v3, v2
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v7, v0
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v4, v2
+; GFX8-NEXT:    v_subb_u32_e32 v8, vcc, v3, v2, vcc
+; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX8-NEXT:    v_xor_b32_e32 v2, v6, v3
+; GFX8-NEXT:    v_mul_f32_e64 v6, |v5|, s4
+; GFX8-NEXT:    v_floor_f32_e32 v6, v6
+; GFX8-NEXT:    v_xor_b32_e32 v4, v7, v3
+; GFX8-NEXT:    v_fma_f32 v7, v6, s5, |v5|
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
+; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
+; GFX8-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v4, v3, vcc
+; GFX8-NEXT:    v_xor_b32_e32 v4, v7, v5
+; GFX8-NEXT:    v_mul_f32_e64 v7, |v1|, s4
+; GFX8-NEXT:    v_floor_f32_e32 v7, v7
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX8-NEXT:    v_fma_f32 v9, v7, s5, |v1|
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v9, v9
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GFX8-NEXT:    v_xor_b32_e32 v6, v6, v5
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v5
+; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v6, v5, vcc
+; GFX8-NEXT:    v_xor_b32_e32 v6, v9, v1
+; GFX8-NEXT:    v_xor_b32_e32 v7, v7, v1
+; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, v6, v1
+; GFX8-NEXT:    v_subb_u32_e32 v7, vcc, v7, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v1, v8
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fptosi_v4bf16_to_v4i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x2f800000
+; GFX9-NEXT:    v_mul_f32_e64 v3, |v2|, s4
+; GFX9-NEXT:    v_floor_f32_e32 v3, v3
+; GFX9-NEXT:    s_mov_b32 s5, 0xcf800000
+; GFX9-NEXT:    v_fma_f32 v4, v3, s5, |v2|
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GFX9-NEXT:    v_trunc_f32_e32 v5, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    v_mul_f32_e64 v0, |v5|, s4
+; GFX9-NEXT:    v_floor_f32_e32 v0, v0
+; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
+; GFX9-NEXT:    v_fma_f32 v6, v0, s5, |v5|
+; GFX9-NEXT:    v_xor_b32_e32 v4, v4, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v0
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v4, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v8, vcc, v3, v2, vcc
+; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX9-NEXT:    v_xor_b32_e32 v2, v6, v3
+; GFX9-NEXT:    v_mul_f32_e64 v6, |v5|, s4
+; GFX9-NEXT:    v_floor_f32_e32 v6, v6
+; GFX9-NEXT:    v_xor_b32_e32 v4, v7, v3
+; GFX9-NEXT:    v_fma_f32 v7, v6, s5, |v5|
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v3
+; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
+; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v4, v7, v5
+; GFX9-NEXT:    v_mul_f32_e64 v7, |v1|, s4
+; GFX9-NEXT:    v_floor_f32_e32 v7, v7
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX9-NEXT:    v_fma_f32 v9, v7, s5, |v1|
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v9, v9
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GFX9-NEXT:    v_xor_b32_e32 v6, v6, v5
+; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v4, v5
+; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v6, v9, v1
+; GFX9-NEXT:    v_xor_b32_e32 v7, v7, v1
+; GFX9-NEXT:    v_sub_co_u32_e32 v6, vcc, v6, v1
+; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, v8
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fptosi_v4bf16_to_v4i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX10-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX10-NEXT:    v_trunc_f32_e32 v4, v1
+; GFX10-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v2|
+; GFX10-NEXT:    v_mul_f32_e64 v6, 0x2f800000, |v0|
+; GFX10-NEXT:    v_mul_f32_e64 v8, 0x2f800000, |v3|
+; GFX10-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
+; GFX10-NEXT:    v_mul_f32_e64 v9, 0x2f800000, |v4|
+; GFX10-NEXT:    v_floor_f32_e32 v1, v1
+; GFX10-NEXT:    v_floor_f32_e32 v6, v6
+; GFX10-NEXT:    v_floor_f32_e32 v8, v8
+; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v0
+; GFX10-NEXT:    v_floor_f32_e32 v9, v9
+; GFX10-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v2|
+; GFX10-NEXT:    v_fma_f32 v0, 0xcf800000, v6, |v0|
+; GFX10-NEXT:    v_ashrrev_i32_e32 v10, 31, v3
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX10-NEXT:    v_fma_f32 v3, 0xcf800000, v8, |v3|
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX10-NEXT:    v_fma_f32 v11, 0xcf800000, v9, |v4|
+; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GFX10-NEXT:    v_xor_b32_e32 v2, v2, v5
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v12, v3
+; GFX10-NEXT:    v_xor_b32_e32 v3, v0, v7
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; GFX10-NEXT:    v_xor_b32_e32 v6, v6, v7
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v11, v11
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v2, v5
+; GFX10-NEXT:    v_ashrrev_i32_e32 v13, 31, v4
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v9, v9
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
+; GFX10-NEXT:    v_xor_b32_e32 v4, v12, v10
+; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v3, v7
+; GFX10-NEXT:    v_xor_b32_e32 v5, v8, v10
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v6, v7, vcc_lo
+; GFX10-NEXT:    v_xor_b32_e32 v6, v11, v13
+; GFX10-NEXT:    v_xor_b32_e32 v7, v9, v13
+; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v4, v10
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v5, v10, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, v6, v13
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v13, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fptosi_v4bf16_to_v4i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX11-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX11-NEXT:    v_trunc_f32_e32 v4, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v2|
+; GFX11-NEXT:    v_mul_f32_e64 v6, 0x2f800000, |v0|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_mul_f32_e64 v8, 0x2f800000, |v3|
+; GFX11-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
+; GFX11-NEXT:    v_mul_f32_e64 v9, 0x2f800000, |v4|
+; GFX11-NEXT:    v_floor_f32_e32 v1, v1
+; GFX11-NEXT:    v_floor_f32_e32 v6, v6
+; GFX11-NEXT:    v_floor_f32_e32 v8, v8
+; GFX11-NEXT:    v_ashrrev_i32_e32 v7, 31, v0
+; GFX11-NEXT:    v_floor_f32_e32 v9, v9
+; GFX11-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v2|
+; GFX11-NEXT:    v_fma_f32 v0, 0xcf800000, v6, |v0|
+; GFX11-NEXT:    v_ashrrev_i32_e32 v10, 31, v3
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX11-NEXT:    v_fma_f32 v3, 0xcf800000, v8, |v3|
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX11-NEXT:    v_fma_f32 v11, 0xcf800000, v9, |v4|
+; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GFX11-NEXT:    v_xor_b32_e32 v2, v2, v5
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v12, v3
+; GFX11-NEXT:    v_xor_b32_e32 v3, v0, v7
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; GFX11-NEXT:    v_xor_b32_e32 v6, v6, v7
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v11, v11
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v2, v5
+; GFX11-NEXT:    v_ashrrev_i32_e32 v13, 31, v4
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v9, v9
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
+; GFX11-NEXT:    v_xor_b32_e32 v4, v12, v10
+; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, v3, v7
+; GFX11-NEXT:    v_xor_b32_e32 v5, v8, v10
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v6, v7, vcc_lo
+; GFX11-NEXT:    v_xor_b32_e32 v6, v11, v13
+; GFX11-NEXT:    v_xor_b32_e32 v7, v9, v13
+; GFX11-NEXT:    v_sub_co_u32 v4, vcc_lo, v4, v10
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v5, v10, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_sub_co_u32 v6, vcc_lo, v6, v13
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v13, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = fptosi <4 x bfloat> %x to <4 x i64>
+  ret <4 x i64> %op
+}
+
+define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
+; GCN-LABEL: v_sitofp_i16_to_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_sitofp_i16_to_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_sitofp_i16_to_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_sitofp_i16_to_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sitofp_i16_to_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sitofp_i16_to_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = sitofp i16 %x to bfloat
+  ret bfloat %op
+}
+
+define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
+; GCN-LABEL: v_sitofp_v2i16_to_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GCN-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_sitofp_v2i16_to_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_sitofp_v2i16_to_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_sitofp_v2i16_to_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sitofp_v2i16_to_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sitofp_v2i16_to_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
+; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = sitofp <2 x i16> %x to <2 x bfloat>
+  ret <2 x bfloat> %op
+}
+
+define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
+; GCN-LABEL: v_sitofp_v3i16_to_v3bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GCN-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GCN-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GCN-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_sitofp_v3i16_to_v3bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_sitofp_v3i16_to_v3bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_sitofp_v3i16_to_v3bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sitofp_v3i16_to_v3bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sitofp_v3i16_to_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 16, v0
+; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX11-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = sitofp <3 x i16> %x to <3 x bfloat>
+  ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
+; GCN-LABEL: v_sitofp_v4i16_to_v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GCN-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GCN-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GCN-NEXT:    v_bfe_i32 v3, v3, 0, 16
+; GCN-NEXT:    v_cvt_f32_i32_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_sitofp_v4i16_to_v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX7-NEXT:    v_bfe_i32 v3, v3, 0, 16
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v3, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_sitofp_v4i16_to_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    v_perm_b32 v1, v1, v2, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_sitofp_v4i16_to_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sitofp_v4i16_to_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x3020706
+; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sitofp_v4i16_to_v4bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_bfe_i32 v2, v1, 0, 16
+; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
+; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
+; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v3, v3
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x3020706
+; GFX11-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = sitofp <4 x i16> %x to <4 x bfloat>
+  ret <4 x bfloat> %op
+}
+
+define bfloat @v_sitofp_i32_to_bf16(i32 %x) {
+; GCN-LABEL: v_sitofp_i32_to_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_sitofp_i32_to_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_sitofp_i32_to_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_sitofp_i32_to_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sitofp_i32_to_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sitofp_i32_to_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = sitofp i32 %x to bfloat
+  ret bfloat %op
+}
+
+define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
+; GCN-LABEL: v_sitofp_v2i32_to_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_sitofp_v2i32_to_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_sitofp_v2i32_to_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_sitofp_v2i32_to_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sitofp_v2i32_to_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sitofp_v2i32_to_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = sitofp <2 x i32> %x to <2 x bfloat>
+  ret <2 x bfloat> %op
+}
+
+define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
+; GCN-LABEL: v_sitofp_v3i32_to_v3bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_sitofp_v3i32_to_v3bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_sitofp_v3i32_to_v3bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v3, v1
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_sitofp_v3i32_to_v3bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v1
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sitofp_v3i32_to_v3bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sitofp_v3i32_to_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = sitofp <3 x i32> %x to <3 x bfloat>
+  ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
+; GCN-LABEL: v_sitofp_v4i32_to_v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_i32_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_sitofp_v4i32_to_v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v3, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_sitofp_v4i32_to_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v3, v3
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX8-NEXT:    v_perm_b32 v1, v2, v3, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_sitofp_v4i32_to_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v3
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
+; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sitofp_v4i32_to_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v3, v3
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sitofp_v4i32_to_v4bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v3, v3
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = sitofp <4 x i32> %x to <4 x bfloat>
+  ret <4 x bfloat> %op
+}
+
+define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
+; GCN-LABEL: v_sitofp_i64_to_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_xor_b32_e32 v2, v0, v1
+; GCN-NEXT:    v_ffbh_i32_e32 v3, v1
+; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, -1, v3
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
+; GCN-NEXT:    v_min_u32_e32 v2, v3, v2
+; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v2
+; GCN-NEXT:    v_min_u32_e32 v0, 1, v0
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GCN-NEXT:    v_sub_i32_e32 v1, vcc, 32, v2
+; GCN-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_sitofp_i64_to_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_xor_b32_e32 v2, v0, v1
+; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
+; GFX7-NEXT:    v_ffbh_i32_e32 v3, v1
+; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, -1, v3
+; GFX7-NEXT:    v_min_u32_e32 v2, v3, v2
+; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], v2
+; GFX7-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, 32, v2
+; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_sitofp_i64_to_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_xor_b32_e32 v2, v0, v1
+; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
+; GFX8-NEXT:    v_ffbh_i32_e32 v3, v1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 32, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, -1, v3
+; GFX8-NEXT:    v_min_u32_e32 v2, v3, v2
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 32, v2
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_sitofp_i64_to_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_xor_b32_e32 v2, v0, v1
+; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
+; GFX9-NEXT:    v_ffbh_i32_e32 v3, v1
+; GFX9-NEXT:    v_add_u32_e32 v2, 32, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, -1, v3
+; GFX9-NEXT:    v_min_u32_e32 v2, v3, v2
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v2
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sitofp_i64_to_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_xor_b32_e32 v2, v0, v1
+; GFX10-NEXT:    v_ffbh_i32_e32 v3, v1
+; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, -1, v3
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, 32, v2
+; GFX10-NEXT:    v_min_u32_e32 v2, v3, v2
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sitofp_i64_to_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_xor_b32_e32 v2, v0, v1
+; GFX11-NEXT:    v_cls_i32_e32 v3, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, -1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 32, v2
+; GFX11-NEXT:    v_min_u32_e32 v2, v3, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = sitofp i64 %x to bfloat
+  ret bfloat %op
+}
+
+define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
+; GCN-LABEL: v_sitofp_v2i64_to_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_ffbh_i32_e32 v4, v3
+; GCN-NEXT:    v_xor_b32_e32 v5, v2, v3
+; GCN-NEXT:    v_ffbh_i32_e32 v6, v1
+; GCN-NEXT:    v_xor_b32_e32 v7, v0, v1
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, -1, v4
+; GCN-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, -1, v6
+; GCN-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, 32, v5
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, 32, v7
+; GCN-NEXT:    v_min_u32_e32 v4, v4, v5
+; GCN-NEXT:    v_min_u32_e32 v5, v6, v7
+; GCN-NEXT:    v_lshl_b64 v[2:3], v[2:3], v4
+; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 32, v4
+; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v5
+; GCN-NEXT:    v_sub_i32_e32 v5, vcc, 32, v5
+; GCN-NEXT:    v_min_u32_e32 v2, 1, v2
+; GCN-NEXT:    v_min_u32_e32 v0, 1, v0
+; GCN-NEXT:    v_or_b32_e32 v2, v3, v2
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v2
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GCN-NEXT:    v_ldexp_f32_e32 v1, v1, v4
+; GCN-NEXT:    v_ldexp_f32_e32 v0, v0, v5
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_sitofp_v2i64_to_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_xor_b32_e32 v5, v2, v3
+; GFX7-NEXT:    v_ffbh_i32_e32 v4, v3
+; GFX7-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
+; GFX7-NEXT:    v_add_i32_e32 v4, vcc, -1, v4
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 32, v5
+; GFX7-NEXT:    v_min_u32_e32 v4, v4, v5
+; GFX7-NEXT:    v_lshl_b64 v[2:3], v[2:3], v4
+; GFX7-NEXT:    v_xor_b32_e32 v5, v0, v1
+; GFX7-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT:    v_ffbh_i32_e32 v3, v1
+; GFX7-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, -1, v3
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 32, v5
+; GFX7-NEXT:    v_min_u32_e32 v3, v3, v5
+; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], v3
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX7-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX7-NEXT:    v_sub_i32_e32 v4, vcc, 32, v4
+; GFX7-NEXT:    v_ldexp_f32_e32 v1, v2, v4
+; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, 32, v3
+; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_sitofp_v2i64_to_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_xor_b32_e32 v5, v2, v3
+; GFX8-NEXT:    v_ffbh_i32_e32 v4, v3
+; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, -1, v4
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 32, v5
+; GFX8-NEXT:    v_min_u32_e32 v4, v4, v5
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
+; GFX8-NEXT:    v_xor_b32_e32 v5, v0, v1
+; GFX8-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_ffbh_i32_e32 v3, v1
+; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, -1, v3
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 32, v5
+; GFX8-NEXT:    v_min_u32_e32 v3, v3, v5
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 32, v4
+; GFX8-NEXT:    v_ldexp_f32 v1, v2, v4
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v3
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_sitofp_v2i64_to_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_xor_b32_e32 v5, v2, v3
+; GFX9-NEXT:    v_ffbh_i32_e32 v4, v3
+; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
+; GFX9-NEXT:    v_add_u32_e32 v4, -1, v4
+; GFX9-NEXT:    v_add_u32_e32 v5, 32, v5
+; GFX9-NEXT:    v_min_u32_e32 v4, v4, v5
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
+; GFX9-NEXT:    v_xor_b32_e32 v5, v0, v1
+; GFX9-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX9-NEXT:    v_ffbh_i32_e32 v3, v1
+; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
+; GFX9-NEXT:    v_add_u32_e32 v3, -1, v3
+; GFX9-NEXT:    v_add_u32_e32 v5, 32, v5
+; GFX9-NEXT:    v_min_u32_e32 v3, v3, v5
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v4
+; GFX9-NEXT:    v_ldexp_f32 v1, v2, v1
+; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v3
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sitofp_v2i64_to_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_xor_b32_e32 v4, v2, v3
+; GFX10-NEXT:    v_xor_b32_e32 v5, v0, v1
+; GFX10-NEXT:    v_ffbh_i32_e32 v6, v3
+; GFX10-NEXT:    v_ffbh_i32_e32 v7, v1
+; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v4
+; GFX10-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, -1, v6
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, -1, v7
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, 32, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, 32, v5
+; GFX10-NEXT:    v_min_u32_e32 v4, v6, v4
+; GFX10-NEXT:    v_min_u32_e32 v5, v7, v5
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v5, v[0:1]
+; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX10-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 32, v4
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v5
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX10-NEXT:    v_ldexp_f32 v1, v2, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sitofp_v2i64_to_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_xor_b32_e32 v4, v2, v3
+; GFX11-NEXT:    v_xor_b32_e32 v5, v0, v1
+; GFX11-NEXT:    v_cls_i32_e32 v6, v3
+; GFX11-NEXT:    v_cls_i32_e32 v7, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v4
+; GFX11-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, -1, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, -1, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 32, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 32, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_min_u32_e32 v4, v6, v4
+; GFX11-NEXT:    v_min_u32_e32 v5, v7, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v5, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v4
+; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 32, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_ldexp_f32 v1, v2, v1
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = sitofp <2 x i64> %x to <2 x bfloat>
+  ret <2 x bfloat> %op
+}
+
+define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
+; GCN-LABEL: v_sitofp_v3i64_to_v3bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_ffbh_i32_e32 v6, v5
+; GCN-NEXT:    v_xor_b32_e32 v7, v4, v5
+; GCN-NEXT:    v_ffbh_i32_e32 v8, v3
+; GCN-NEXT:    v_xor_b32_e32 v9, v2, v3
+; GCN-NEXT:    v_ffbh_i32_e32 v10, v1
+; GCN-NEXT:    v_xor_b32_e32 v11, v0, v1
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, -1, v6
+; GCN-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, -1, v8
+; GCN-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
+; GCN-NEXT:    v_add_i32_e32 v10, vcc, -1, v10
+; GCN-NEXT:    v_ashrrev_i32_e32 v11, 31, v11
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, 32, v7
+; GCN-NEXT:    v_add_i32_e32 v9, vcc, 32, v9
+; GCN-NEXT:    v_add_i32_e32 v11, vcc, 32, v11
+; GCN-NEXT:    v_min_u32_e32 v6, v6, v7
+; GCN-NEXT:    v_min_u32_e32 v7, v8, v9
+; GCN-NEXT:    v_min_u32_e32 v8, v10, v11
+; GCN-NEXT:    v_lshl_b64 v[4:5], v[4:5], v6
+; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 32, v6
+; GCN-NEXT:    v_lshl_b64 v[2:3], v[2:3], v7
+; GCN-NEXT:    v_sub_i32_e32 v7, vcc, 32, v7
+; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v8
+; GCN-NEXT:    v_sub_i32_e32 v8, vcc, 32, v8
+; GCN-NEXT:    v_min_u32_e32 v4, 1, v4
+; GCN-NEXT:    v_min_u32_e32 v2, 1, v2
+; GCN-NEXT:    v_min_u32_e32 v0, 1, v0
+; GCN-NEXT:    v_or_b32_e32 v4, v5, v4
+; GCN-NEXT:    v_or_b32_e32 v2, v3, v2
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v4
+; GCN-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GCN-NEXT:    v_ldexp_f32_e32 v3, v1, v6
+; GCN-NEXT:    v_ldexp_f32_e32 v1, v2, v7
+; GCN-NEXT:    v_ldexp_f32_e32 v0, v0, v8
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_sitofp_v3i64_to_v3bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_xor_b32_e32 v7, v4, v5
+; GFX7-NEXT:    v_ffbh_i32_e32 v6, v5
+; GFX7-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
+; GFX7-NEXT:    v_add_i32_e32 v6, vcc, -1, v6
+; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 32, v7
+; GFX7-NEXT:    v_min_u32_e32 v6, v6, v7
+; GFX7-NEXT:    v_lshl_b64 v[4:5], v[4:5], v6
+; GFX7-NEXT:    v_xor_b32_e32 v7, v2, v3
+; GFX7-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX7-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX7-NEXT:    v_sub_i32_e32 v5, vcc, 32, v6
+; GFX7-NEXT:    v_ffbh_i32_e32 v6, v3
+; GFX7-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v4, v4
+; GFX7-NEXT:    v_add_i32_e32 v6, vcc, -1, v6
+; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 32, v7
+; GFX7-NEXT:    v_min_u32_e32 v6, v6, v7
+; GFX7-NEXT:    v_lshl_b64 v[2:3], v[2:3], v6
+; GFX7-NEXT:    v_ldexp_f32_e32 v4, v4, v5
+; GFX7-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX7-NEXT:    v_xor_b32_e32 v5, v0, v1
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT:    v_ffbh_i32_e32 v3, v1
+; GFX7-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, -1, v3
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 32, v5
+; GFX7-NEXT:    v_min_u32_e32 v3, v3, v5
+; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], v3
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX7-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX7-NEXT:    v_sub_i32_e32 v5, vcc, 32, v6
+; GFX7-NEXT:    v_ldexp_f32_e32 v1, v2, v5
+; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, 32, v3
+; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_sitofp_v3i64_to_v3bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_xor_b32_e32 v7, v4, v5
+; GFX8-NEXT:    v_ffbh_i32_e32 v6, v5
+; GFX8-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, -1, v6
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 32, v7
+; GFX8-NEXT:    v_min_u32_e32 v6, v6, v7
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v4, v4
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v6
+; GFX8-NEXT:    v_xor_b32_e32 v6, v2, v3
+; GFX8-NEXT:    v_ldexp_f32 v5, v4, v5
+; GFX8-NEXT:    v_ffbh_i32_e32 v4, v3
+; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 31, v6
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, -1, v4
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 32, v6
+; GFX8-NEXT:    v_min_u32_e32 v6, v4, v6
+; GFX8-NEXT:    v_lshlrev_b64 v[3:4], v6, v[2:3]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
+; GFX8-NEXT:    v_min_u32_e32 v3, 1, v3
+; GFX8-NEXT:    v_xor_b32_e32 v5, v0, v1
+; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT:    v_ffbh_i32_e32 v4, v1
+; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, -1, v4
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 32, v5
+; GFX8-NEXT:    v_min_u32_e32 v4, v4, v5
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v3, v3
+; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v6
+; GFX8-NEXT:    v_ldexp_f32 v1, v3, v5
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 32, v4
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v2
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_sitofp_v3i64_to_v3bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_xor_b32_e32 v7, v4, v5
+; GFX9-NEXT:    v_ffbh_i32_e32 v6, v5
+; GFX9-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
+; GFX9-NEXT:    v_add_u32_e32 v6, -1, v6
+; GFX9-NEXT:    v_add_u32_e32 v7, 32, v7
+; GFX9-NEXT:    v_min_u32_e32 v6, v6, v7
+; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, v4
+; GFX9-NEXT:    v_sub_u32_e32 v5, 32, v6
+; GFX9-NEXT:    v_xor_b32_e32 v6, v2, v3
+; GFX9-NEXT:    v_ashrrev_i32_e32 v6, 31, v6
+; GFX9-NEXT:    v_ldexp_f32 v5, v4, v5
+; GFX9-NEXT:    v_ffbh_i32_e32 v4, v3
+; GFX9-NEXT:    v_add_u32_e32 v4, -1, v4
+; GFX9-NEXT:    v_add_u32_e32 v6, 32, v6
+; GFX9-NEXT:    v_min_u32_e32 v6, v4, v6
+; GFX9-NEXT:    v_lshlrev_b64 v[3:4], v6, v[2:3]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
+; GFX9-NEXT:    v_min_u32_e32 v3, 1, v3
+; GFX9-NEXT:    v_xor_b32_e32 v5, v0, v1
+; GFX9-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX9-NEXT:    v_ffbh_i32_e32 v4, v1
+; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
+; GFX9-NEXT:    v_add_u32_e32 v4, -1, v4
+; GFX9-NEXT:    v_add_u32_e32 v5, 32, v5
+; GFX9-NEXT:    v_min_u32_e32 v4, v4, v5
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v3
+; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v6
+; GFX9-NEXT:    v_ldexp_f32 v1, v3, v1
+; GFX9-NEXT:    v_sub_u32_e32 v3, 32, v4
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sitofp_v3i64_to_v3bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_xor_b32_e32 v7, v2, v3
+; GFX10-NEXT:    v_xor_b32_e32 v8, v4, v5
+; GFX10-NEXT:    v_xor_b32_e32 v9, v0, v1
+; GFX10-NEXT:    v_ffbh_i32_e32 v6, v5
+; GFX10-NEXT:    v_ffbh_i32_e32 v10, v3
+; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
+; GFX10-NEXT:    v_ffbh_i32_e32 v11, v1
+; GFX10-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
+; GFX10-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, -1, v6
+; GFX10-NEXT:    v_add_nc_u32_e32 v10, -1, v10
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, 32, v7
+; GFX10-NEXT:    v_add_nc_u32_e32 v11, -1, v11
+; GFX10-NEXT:    v_add_nc_u32_e32 v9, 32, v9
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX10-NEXT:    v_min_u32_e32 v7, v10, v7
+; GFX10-NEXT:    v_min_u32_e32 v9, v11, v9
+; GFX10-NEXT:    v_min_u32_e32 v6, v6, v8
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 32, v6
+; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX10-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX10-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v7
+; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_or_b32_e32 v1, v5, v4
+; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 32, v9
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX10-NEXT:    v_ldexp_f32 v2, v2, v3
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v4
+; GFX10-NEXT:    v_ldexp_f32 v1, v1, v6
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sitofp_v3i64_to_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_xor_b32_e32 v7, v2, v3
+; GFX11-NEXT:    v_xor_b32_e32 v8, v4, v5
+; GFX11-NEXT:    v_xor_b32_e32 v9, v0, v1
+; GFX11-NEXT:    v_cls_i32_e32 v6, v5
+; GFX11-NEXT:    v_cls_i32_e32 v10, v3
+; GFX11-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
+; GFX11-NEXT:    v_cls_i32_e32 v11, v1
+; GFX11-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
+; GFX11-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, -1, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, -1, v10
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 32, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, -1, v11
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 32, v9
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_min_u32_e32 v7, v10, v7
+; GFX11-NEXT:    v_min_u32_e32 v9, v11, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_min_u32_e32 v6, v6, v8
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 32, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 32, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v5, v4
+; GFX11-NEXT:    v_sub_nc_u32_e32 v4, 32, v9
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_ldexp_f32 v2, v2, v3
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_ldexp_f32 v1, v1, v6
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = sitofp <3 x i64> %x to <3 x bfloat>
+  ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
+; GCN-LABEL: v_sitofp_v4i64_to_v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_ffbh_i32_e32 v8, v7
+; GCN-NEXT:    v_xor_b32_e32 v9, v6, v7
+; GCN-NEXT:    v_ffbh_i32_e32 v10, v5
+; GCN-NEXT:    v_xor_b32_e32 v11, v4, v5
+; GCN-NEXT:    v_ffbh_i32_e32 v12, v3
+; GCN-NEXT:    v_xor_b32_e32 v13, v2, v3
+; GCN-NEXT:    v_ffbh_i32_e32 v14, v1
+; GCN-NEXT:    v_xor_b32_e32 v15, v0, v1
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, -1, v8
+; GCN-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
+; GCN-NEXT:    v_add_i32_e32 v10, vcc, -1, v10
+; GCN-NEXT:    v_ashrrev_i32_e32 v11, 31, v11
+; GCN-NEXT:    v_add_i32_e32 v12, vcc, -1, v12
+; GCN-NEXT:    v_ashrrev_i32_e32 v13, 31, v13
+; GCN-NEXT:    v_add_i32_e32 v14, vcc, -1, v14
+; GCN-NEXT:    v_ashrrev_i32_e32 v15, 31, v15
+; GCN-NEXT:    v_add_i32_e32 v9, vcc, 32, v9
+; GCN-NEXT:    v_add_i32_e32 v11, vcc, 32, v11
+; GCN-NEXT:    v_add_i32_e32 v13, vcc, 32, v13
+; GCN-NEXT:    v_add_i32_e32 v15, vcc, 32, v15
+; GCN-NEXT:    v_min_u32_e32 v8, v8, v9
+; GCN-NEXT:    v_min_u32_e32 v9, v10, v11
+; GCN-NEXT:    v_min_u32_e32 v10, v12, v13
+; GCN-NEXT:    v_min_u32_e32 v11, v14, v15
+; GCN-NEXT:    v_lshl_b64 v[6:7], v[6:7], v8
+; GCN-NEXT:    v_sub_i32_e32 v8, vcc, 32, v8
+; GCN-NEXT:    v_lshl_b64 v[4:5], v[4:5], v9
+; GCN-NEXT:    v_sub_i32_e32 v9, vcc, 32, v9
+; GCN-NEXT:    v_lshl_b64 v[2:3], v[2:3], v10
+; GCN-NEXT:    v_sub_i32_e32 v10, vcc, 32, v10
+; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v11
+; GCN-NEXT:    v_sub_i32_e32 v11, vcc, 32, v11
+; GCN-NEXT:    v_min_u32_e32 v6, 1, v6
+; GCN-NEXT:    v_min_u32_e32 v4, 1, v4
+; GCN-NEXT:    v_min_u32_e32 v2, 1, v2
+; GCN-NEXT:    v_min_u32_e32 v0, 1, v0
+; GCN-NEXT:    v_or_b32_e32 v6, v7, v6
+; GCN-NEXT:    v_or_b32_e32 v4, v5, v4
+; GCN-NEXT:    v_or_b32_e32 v2, v3, v2
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v6
+; GCN-NEXT:    v_cvt_f32_i32_e32 v3, v4
+; GCN-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GCN-NEXT:    v_ldexp_f32_e32 v4, v1, v8
+; GCN-NEXT:    v_ldexp_f32_e32 v3, v3, v9
+; GCN-NEXT:    v_ldexp_f32_e32 v1, v2, v10
+; GCN-NEXT:    v_ldexp_f32_e32 v0, v0, v11
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_sitofp_v4i64_to_v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_xor_b32_e32 v9, v6, v7
+; GFX7-NEXT:    v_ffbh_i32_e32 v8, v7
+; GFX7-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
+; GFX7-NEXT:    v_add_i32_e32 v8, vcc, -1, v8
+; GFX7-NEXT:    v_add_i32_e32 v9, vcc, 32, v9
+; GFX7-NEXT:    v_min_u32_e32 v8, v8, v9
+; GFX7-NEXT:    v_lshl_b64 v[6:7], v[6:7], v8
+; GFX7-NEXT:    v_xor_b32_e32 v9, v4, v5
+; GFX7-NEXT:    v_min_u32_e32 v6, 1, v6
+; GFX7-NEXT:    v_or_b32_e32 v6, v7, v6
+; GFX7-NEXT:    v_sub_i32_e32 v7, vcc, 32, v8
+; GFX7-NEXT:    v_ffbh_i32_e32 v8, v5
+; GFX7-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
+; GFX7-NEXT:    v_add_i32_e32 v8, vcc, -1, v8
+; GFX7-NEXT:    v_add_i32_e32 v9, vcc, 32, v9
+; GFX7-NEXT:    v_min_u32_e32 v8, v8, v9
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v6, v6
+; GFX7-NEXT:    v_lshl_b64 v[4:5], v[4:5], v8
+; GFX7-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX7-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX7-NEXT:    v_sub_i32_e32 v5, vcc, 32, v8
+; GFX7-NEXT:    v_xor_b32_e32 v8, v2, v3
+; GFX7-NEXT:    v_ldexp_f32_e32 v6, v6, v7
+; GFX7-NEXT:    v_ffbh_i32_e32 v7, v3
+; GFX7-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v4, v4
+; GFX7-NEXT:    v_add_i32_e32 v7, vcc, -1, v7
+; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 32, v8
+; GFX7-NEXT:    v_min_u32_e32 v7, v7, v8
+; GFX7-NEXT:    v_lshl_b64 v[2:3], v[2:3], v7
+; GFX7-NEXT:    v_ldexp_f32_e32 v4, v4, v5
+; GFX7-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX7-NEXT:    v_xor_b32_e32 v5, v0, v1
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT:    v_ffbh_i32_e32 v3, v1
+; GFX7-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, -1, v3
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 32, v5
+; GFX7-NEXT:    v_min_u32_e32 v3, v3, v5
+; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], v3
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX7-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX7-NEXT:    v_sub_i32_e32 v5, vcc, 32, v7
+; GFX7-NEXT:    v_ldexp_f32_e32 v1, v2, v5
+; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, 32, v3
+; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v6
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_sitofp_v4i64_to_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_xor_b32_e32 v9, v6, v7
+; GFX8-NEXT:    v_ffbh_i32_e32 v8, v7
+; GFX8-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, -1, v8
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 32, v9
+; GFX8-NEXT:    v_min_u32_e32 v8, v8, v9
+; GFX8-NEXT:    v_lshlrev_b64 v[6:7], v8, v[6:7]
+; GFX8-NEXT:    v_xor_b32_e32 v9, v4, v5
+; GFX8-NEXT:    v_min_u32_e32 v6, 1, v6
+; GFX8-NEXT:    v_or_b32_e32 v6, v7, v6
+; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, 32, v8
+; GFX8-NEXT:    v_ffbh_i32_e32 v8, v5
+; GFX8-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, -1, v8
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 32, v9
+; GFX8-NEXT:    v_min_u32_e32 v8, v8, v9
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v6, v6
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v8
+; GFX8-NEXT:    v_xor_b32_e32 v8, v2, v3
+; GFX8-NEXT:    v_ldexp_f32 v6, v6, v7
+; GFX8-NEXT:    v_ffbh_i32_e32 v7, v3
+; GFX8-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v4, v4
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, -1, v7
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 32, v8
+; GFX8-NEXT:    v_min_u32_e32 v7, v7, v8
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
+; GFX8-NEXT:    v_ldexp_f32 v4, v4, v5
+; GFX8-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX8-NEXT:    v_xor_b32_e32 v5, v0, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_ffbh_i32_e32 v3, v1
+; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, -1, v3
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 32, v5
+; GFX8-NEXT:    v_min_u32_e32 v3, v3, v5
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v7
+; GFX8-NEXT:    v_ldexp_f32 v1, v2, v5
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v3
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX8-NEXT:    v_perm_b32 v1, v4, v6, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_sitofp_v4i64_to_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_xor_b32_e32 v9, v4, v5
+; GFX9-NEXT:    v_ffbh_i32_e32 v8, v5
+; GFX9-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
+; GFX9-NEXT:    v_add_u32_e32 v8, -1, v8
+; GFX9-NEXT:    v_add_u32_e32 v9, 32, v9
+; GFX9-NEXT:    v_min_u32_e32 v8, v8, v9
+; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX9-NEXT:    v_sub_u32_e32 v8, 32, v8
+; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX9-NEXT:    v_xor_b32_e32 v5, v6, v7
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v9, v4
+; GFX9-NEXT:    v_ffbh_i32_e32 v4, v7
+; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
+; GFX9-NEXT:    v_add_u32_e32 v4, -1, v4
+; GFX9-NEXT:    v_add_u32_e32 v5, 32, v5
+; GFX9-NEXT:    v_min_u32_e32 v10, v4, v5
+; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v10, v[6:7]
+; GFX9-NEXT:    v_ldexp_f32 v6, v9, v8
+; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX9-NEXT:    v_xor_b32_e32 v8, v2, v3
+; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX9-NEXT:    v_ffbh_i32_e32 v7, v3
+; GFX9-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, v4
+; GFX9-NEXT:    v_add_u32_e32 v7, -1, v7
+; GFX9-NEXT:    v_add_u32_e32 v8, 32, v8
+; GFX9-NEXT:    v_min_u32_e32 v7, v7, v8
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
+; GFX9-NEXT:    v_sub_u32_e32 v5, 32, v10
+; GFX9-NEXT:    v_ldexp_f32 v4, v4, v5
+; GFX9-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX9-NEXT:    v_xor_b32_e32 v5, v0, v1
+; GFX9-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX9-NEXT:    v_ffbh_i32_e32 v3, v1
+; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
+; GFX9-NEXT:    v_add_u32_e32 v3, -1, v3
+; GFX9-NEXT:    v_add_u32_e32 v5, 32, v5
+; GFX9-NEXT:    v_min_u32_e32 v3, v3, v5
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v7
+; GFX9-NEXT:    v_ldexp_f32 v1, v2, v1
+; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v3
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GFX9-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sitofp_v4i64_to_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_xor_b32_e32 v8, v4, v5
+; GFX10-NEXT:    v_ffbh_i32_e32 v9, v5
+; GFX10-NEXT:    v_xor_b32_e32 v11, v6, v7
+; GFX10-NEXT:    v_ffbh_i32_e32 v10, v7
+; GFX10-NEXT:    v_xor_b32_e32 v13, v2, v3
+; GFX10-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
+; GFX10-NEXT:    v_add_nc_u32_e32 v9, -1, v9
+; GFX10-NEXT:    v_xor_b32_e32 v15, v0, v1
+; GFX10-NEXT:    v_ashrrev_i32_e32 v11, 31, v11
+; GFX10-NEXT:    v_ffbh_i32_e32 v12, v3
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX10-NEXT:    v_ffbh_i32_e32 v14, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v11, 32, v11
+; GFX10-NEXT:    v_add_nc_u32_e32 v12, -1, v12
+; GFX10-NEXT:    v_min_u32_e32 v8, v9, v8
+; GFX10-NEXT:    v_add_nc_u32_e32 v9, -1, v10
+; GFX10-NEXT:    v_ashrrev_i32_e32 v10, 31, v13
+; GFX10-NEXT:    v_ashrrev_i32_e32 v13, 31, v15
+; GFX10-NEXT:    v_add_nc_u32_e32 v14, -1, v14
+; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX10-NEXT:    v_min_u32_e32 v9, v9, v11
+; GFX10-NEXT:    v_add_nc_u32_e32 v10, 32, v10
+; GFX10-NEXT:    v_add_nc_u32_e32 v13, 32, v13
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], v9, v[6:7]
+; GFX10-NEXT:    v_min_u32_e32 v10, v12, v10
+; GFX10-NEXT:    v_min_u32_e32 v11, v14, v13
+; GFX10-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v10, v[2:3]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v11, v[0:1]
+; GFX10-NEXT:    v_min_u32_e32 v6, 1, v6
+; GFX10-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX10-NEXT:    v_or_b32_e32 v5, v7, v6
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v4, v4
+; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 32, v10
+; GFX10-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v5
+; GFX10-NEXT:    v_sub_nc_u32_e32 v5, 32, v9
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v8
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v7, 32, v11
+; GFX10-NEXT:    v_ldexp_f32 v1, v1, v5
+; GFX10-NEXT:    v_ldexp_f32 v3, v4, v3
+; GFX10-NEXT:    v_ldexp_f32 v2, v2, v6
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v7
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_sitofp_v4i64_to_v4bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_xor_b32_e32 v8, v4, v5
+; GFX11-NEXT:    v_cls_i32_e32 v9, v5
+; GFX11-NEXT:    v_xor_b32_e32 v11, v6, v7
+; GFX11-NEXT:    v_cls_i32_e32 v10, v7
+; GFX11-NEXT:    v_xor_b32_e32 v13, v2, v3
+; GFX11-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, -1, v9
+; GFX11-NEXT:    v_xor_b32_e32 v14, v0, v1
+; GFX11-NEXT:    v_ashrrev_i32_e32 v11, 31, v11
+; GFX11-NEXT:    v_cls_i32_e32 v12, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, -1, v10
+; GFX11-NEXT:    v_ashrrev_i32_e32 v14, 31, v14
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 32, v11
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, -1, v12
+; GFX11-NEXT:    v_min_u32_e32 v8, v9, v8
+; GFX11-NEXT:    v_ashrrev_i32_e32 v9, 31, v13
+; GFX11-NEXT:    v_cls_i32_e32 v13, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v14, 32, v14
+; GFX11-NEXT:    v_min_u32_e32 v10, v10, v11
+; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 32, v9
+; GFX11-NEXT:    v_add_nc_u32_e32 v13, -1, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v10, v[6:7]
+; GFX11-NEXT:    v_min_u32_e32 v9, v12, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_min_u32_e32 v11, v13, v14
+; GFX11-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v9, v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v11, v[0:1]
+; GFX11-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX11-NEXT:    v_min_u32_e32 v5, 1, v6
+; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 32, v8
+; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v4, v4
+; GFX11-NEXT:    v_or_b32_e32 v5, v7, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ldexp_f32 v1, v4, v6
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v3, v5
+; GFX11-NEXT:    v_sub_nc_u32_e32 v4, 32, v10
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX11-NEXT:    v_sub_nc_u32_e32 v5, 32, v9
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 32, v11
+; GFX11-NEXT:    v_ldexp_f32 v3, v3, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_ldexp_f32 v2, v2, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v6
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = sitofp <4 x i64> %x to <4 x bfloat>
+  ret <4 x bfloat> %op
+}
+
+define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
+; GCN-LABEL: v_uitofp_i16_to_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_uitofp_i16_to_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_uitofp_i16_to_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_uitofp_i16_to_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_uitofp_i16_to_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_uitofp_i16_to_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = uitofp i16 %x to bfloat
+  ret bfloat %op
+}
+
+define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
+; GCN-LABEL: v_uitofp_v2i16_to_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_uitofp_v2i16_to_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_uitofp_v2i16_to_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_uitofp_v2i16_to_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_uitofp_v2i16_to_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_uitofp_v2i16_to_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = uitofp <2 x i16> %x to <2 x bfloat>
+  ret <2 x bfloat> %op
+}
+
+define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
+; GCN-LABEL: v_uitofp_v3i16_to_v3bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_uitofp_v3i16_to_v3bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_uitofp_v3i16_to_v3bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_uitofp_v3i16_to_v3bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_uitofp_v3i16_to_v3bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_uitofp_v3i16_to_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = uitofp <3 x i16> %x to <3 x bfloat>
+  ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
+; GCN-LABEL: v_uitofp_v4i16_to_v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_uitofp_v4i16_to_v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_uitofp_v4i16_to_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    v_perm_b32 v1, v1, v2, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_uitofp_v4i16_to_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_uitofp_v4i16_to_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x3020706
+; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_uitofp_v4i16_to_v4bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x3020706
+; GFX11-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = uitofp <4 x i16> %x to <4 x bfloat>
+  ret <4 x bfloat> %op
+}
+
+define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
+; GCN-LABEL: v_uitofp_i32_to_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_uitofp_i32_to_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_uitofp_i32_to_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_uitofp_i32_to_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_uitofp_i32_to_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_uitofp_i32_to_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = uitofp i32 %x to bfloat
+  ret bfloat %op
+}
+
+define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
+; GCN-LABEL: v_uitofp_v2i32_to_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_uitofp_v2i32_to_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_uitofp_v2i32_to_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_uitofp_v2i32_to_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_uitofp_v2i32_to_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_uitofp_v2i32_to_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = uitofp <2 x i32> %x to <2 x bfloat>
+  ret <2 x bfloat> %op
+}
+
+define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
+; GCN-LABEL: v_uitofp_v3i32_to_v3bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_uitofp_v3i32_to_v3bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_uitofp_v3i32_to_v3bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, v1
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_uitofp_v3i32_to_v3bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v1
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_uitofp_v3i32_to_v3bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_uitofp_v3i32_to_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = uitofp <3 x i32> %x to <3 x bfloat>
+  ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
+; GCN-LABEL: v_uitofp_v4i32_to_v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_uitofp_v4i32_to_v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_uitofp_v4i32_to_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX8-NEXT:    v_perm_b32 v1, v2, v3, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_uitofp_v4i32_to_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
+; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_uitofp_v4i32_to_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_uitofp_v4i32_to_v4bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = uitofp <4 x i32> %x to <4 x bfloat>
+  ret <4 x bfloat> %op
+}
+
+define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
+; GCN-LABEL: v_uitofp_i64_to_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_ffbh_u32_e32 v2, v1
+; GCN-NEXT:    v_min_u32_e32 v2, 32, v2
+; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v2
+; GCN-NEXT:    v_min_u32_e32 v0, 1, v0
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GCN-NEXT:    v_sub_i32_e32 v1, vcc, 32, v2
+; GCN-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_uitofp_i64_to_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_ffbh_u32_e32 v2, v1
+; GFX7-NEXT:    v_min_u32_e32 v2, 32, v2
+; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], v2
+; GFX7-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, 32, v2
+; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_uitofp_i64_to_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_ffbh_u32_e32 v2, v1
+; GFX8-NEXT:    v_min_u32_e32 v2, 32, v2
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 32, v2
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_uitofp_i64_to_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_ffbh_u32_e32 v2, v1
+; GFX9-NEXT:    v_min_u32_e32 v2, 32, v2
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v2
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_uitofp_i64_to_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
+; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_uitofp_i64_to_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_clz_i32_u32_e32 v2, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_min_u32_e32 v2, 32, v2
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = uitofp i64 %x to bfloat
+  ret bfloat %op
+}
+
+define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
+; GCN-LABEL: v_uitofp_v2i64_to_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_ffbh_u32_e32 v4, v3
+; GCN-NEXT:    v_ffbh_u32_e32 v5, v1
+; GCN-NEXT:    v_min_u32_e32 v4, 32, v4
+; GCN-NEXT:    v_min_u32_e32 v5, 32, v5
+; GCN-NEXT:    v_lshl_b64 v[2:3], v[2:3], v4
+; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 32, v4
+; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v5
+; GCN-NEXT:    v_sub_i32_e32 v5, vcc, 32, v5
+; GCN-NEXT:    v_min_u32_e32 v2, 1, v2
+; GCN-NEXT:    v_min_u32_e32 v0, 1, v0
+; GCN-NEXT:    v_or_b32_e32 v2, v3, v2
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GCN-NEXT:    v_ldexp_f32_e32 v1, v1, v4
+; GCN-NEXT:    v_ldexp_f32_e32 v0, v0, v5
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_uitofp_v2i64_to_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_ffbh_u32_e32 v4, v3
+; GFX7-NEXT:    v_min_u32_e32 v4, 32, v4
+; GFX7-NEXT:    v_lshl_b64 v[2:3], v[2:3], v4
+; GFX7-NEXT:    v_sub_i32_e32 v4, vcc, 32, v4
+; GFX7-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT:    v_ffbh_u32_e32 v3, v1
+; GFX7-NEXT:    v_min_u32_e32 v3, 32, v3
+; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], v3
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX7-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX7-NEXT:    v_ldexp_f32_e32 v1, v2, v4
+; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, 32, v3
+; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_uitofp_v2i64_to_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_ffbh_u32_e32 v4, v3
+; GFX8-NEXT:    v_min_u32_e32 v4, 32, v4
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 32, v4
+; GFX8-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_ffbh_u32_e32 v3, v1
+; GFX8-NEXT:    v_min_u32_e32 v3, 32, v3
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX8-NEXT:    v_ldexp_f32 v1, v2, v4
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v3
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_uitofp_v2i64_to_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_ffbh_u32_e32 v4, v3
+; GFX9-NEXT:    v_min_u32_e32 v4, 32, v4
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX9-NEXT:    v_ffbh_u32_e32 v3, v1
+; GFX9-NEXT:    v_min_u32_e32 v3, 32, v3
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v4
+; GFX9-NEXT:    v_ldexp_f32 v1, v2, v1
+; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v3
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_uitofp_v2i64_to_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_ffbh_u32_e32 v4, v3
+; GFX10-NEXT:    v_ffbh_u32_e32 v5, v1
+; GFX10-NEXT:    v_min_u32_e32 v4, 32, v4
+; GFX10-NEXT:    v_min_u32_e32 v5, 32, v5
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v5, v[0:1]
+; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX10-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 32, v4
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v5
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX10-NEXT:    v_ldexp_f32 v1, v2, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_uitofp_v2i64_to_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_clz_i32_u32_e32 v4, v3
+; GFX11-NEXT:    v_clz_i32_u32_e32 v5, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_min_u32_e32 v4, 32, v4
+; GFX11-NEXT:    v_min_u32_e32 v5, 32, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v5, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v4
+; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 32, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_ldexp_f32 v1, v2, v1
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = uitofp <2 x i64> %x to <2 x bfloat>
+  ret <2 x bfloat> %op
+}
+
+define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
+; GCN-LABEL: v_uitofp_v3i64_to_v3bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_ffbh_u32_e32 v6, v5
+; GCN-NEXT:    v_ffbh_u32_e32 v7, v3
+; GCN-NEXT:    v_ffbh_u32_e32 v8, v1
+; GCN-NEXT:    v_min_u32_e32 v6, 32, v6
+; GCN-NEXT:    v_min_u32_e32 v7, 32, v7
+; GCN-NEXT:    v_min_u32_e32 v8, 32, v8
+; GCN-NEXT:    v_lshl_b64 v[4:5], v[4:5], v6
+; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 32, v6
+; GCN-NEXT:    v_lshl_b64 v[2:3], v[2:3], v7
+; GCN-NEXT:    v_sub_i32_e32 v7, vcc, 32, v7
+; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v8
+; GCN-NEXT:    v_sub_i32_e32 v8, vcc, 32, v8
+; GCN-NEXT:    v_min_u32_e32 v4, 1, v4
+; GCN-NEXT:    v_min_u32_e32 v2, 1, v2
+; GCN-NEXT:    v_min_u32_e32 v0, 1, v0
+; GCN-NEXT:    v_or_b32_e32 v4, v5, v4
+; GCN-NEXT:    v_or_b32_e32 v2, v3, v2
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v4
+; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GCN-NEXT:    v_ldexp_f32_e32 v3, v1, v6
+; GCN-NEXT:    v_ldexp_f32_e32 v1, v2, v7
+; GCN-NEXT:    v_ldexp_f32_e32 v0, v0, v8
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_uitofp_v3i64_to_v3bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_ffbh_u32_e32 v6, v5
+; GFX7-NEXT:    v_min_u32_e32 v6, 32, v6
+; GFX7-NEXT:    v_lshl_b64 v[4:5], v[4:5], v6
+; GFX7-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX7-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX7-NEXT:    v_sub_i32_e32 v5, vcc, 32, v6
+; GFX7-NEXT:    v_ffbh_u32_e32 v6, v3
+; GFX7-NEXT:    v_min_u32_e32 v6, 32, v6
+; GFX7-NEXT:    v_lshl_b64 v[2:3], v[2:3], v6
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v4, v4
+; GFX7-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT:    v_ffbh_u32_e32 v3, v1
+; GFX7-NEXT:    v_min_u32_e32 v3, 32, v3
+; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], v3
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX7-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX7-NEXT:    v_ldexp_f32_e32 v4, v4, v5
+; GFX7-NEXT:    v_sub_i32_e32 v5, vcc, 32, v6
+; GFX7-NEXT:    v_ldexp_f32_e32 v1, v2, v5
+; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, 32, v3
+; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_uitofp_v3i64_to_v3bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_ffbh_u32_e32 v6, v5
+; GFX8-NEXT:    v_min_u32_e32 v6, 32, v6
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, v4
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v6
+; GFX8-NEXT:    v_ldexp_f32 v5, v4, v5
+; GFX8-NEXT:    v_ffbh_u32_e32 v4, v3
+; GFX8-NEXT:    v_min_u32_e32 v6, 32, v4
+; GFX8-NEXT:    v_lshlrev_b64 v[3:4], v6, v[2:3]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
+; GFX8-NEXT:    v_min_u32_e32 v3, 1, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT:    v_ffbh_u32_e32 v4, v1
+; GFX8-NEXT:    v_min_u32_e32 v4, 32, v4
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v6
+; GFX8-NEXT:    v_ldexp_f32 v1, v3, v5
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 32, v4
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v2
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_uitofp_v3i64_to_v3bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_ffbh_u32_e32 v6, v5
+; GFX9-NEXT:    v_min_u32_e32 v6, 32, v6
+; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v4
+; GFX9-NEXT:    v_sub_u32_e32 v5, 32, v6
+; GFX9-NEXT:    v_ldexp_f32 v5, v4, v5
+; GFX9-NEXT:    v_ffbh_u32_e32 v4, v3
+; GFX9-NEXT:    v_min_u32_e32 v6, 32, v4
+; GFX9-NEXT:    v_lshlrev_b64 v[3:4], v6, v[2:3]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
+; GFX9-NEXT:    v_min_u32_e32 v3, 1, v3
+; GFX9-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX9-NEXT:    v_ffbh_u32_e32 v4, v1
+; GFX9-NEXT:    v_min_u32_e32 v4, 32, v4
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v6
+; GFX9-NEXT:    v_ldexp_f32 v1, v3, v1
+; GFX9-NEXT:    v_sub_u32_e32 v3, 32, v4
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_uitofp_v3i64_to_v3bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_ffbh_u32_e32 v6, v3
+; GFX10-NEXT:    v_ffbh_u32_e32 v7, v1
+; GFX10-NEXT:    v_ffbh_u32_e32 v8, v5
+; GFX10-NEXT:    v_min_u32_e32 v6, 32, v6
+; GFX10-NEXT:    v_min_u32_e32 v7, 32, v7
+; GFX10-NEXT:    v_min_u32_e32 v8, 32, v8
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v6, v[2:3]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX10-NEXT:    v_sub_nc_u32_e32 v8, 32, v8
+; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX10-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX10-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_or_b32_e32 v1, v5, v4
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v6
+; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 32, v7
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX10-NEXT:    v_ldexp_f32 v2, v2, v3
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v4
+; GFX10-NEXT:    v_ldexp_f32 v1, v1, v8
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_uitofp_v3i64_to_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_clz_i32_u32_e32 v6, v3
+; GFX11-NEXT:    v_clz_i32_u32_e32 v7, v1
+; GFX11-NEXT:    v_clz_i32_u32_e32 v8, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_min_u32_e32 v6, 32, v6
+; GFX11-NEXT:    v_min_u32_e32 v7, 32, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_min_u32_e32 v8, 32, v8
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v6, v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX11-NEXT:    v_sub_nc_u32_e32 v8, 32, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v5, v4
+; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 32, v6
+; GFX11-NEXT:    v_sub_nc_u32_e32 v4, 32, v7
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_ldexp_f32 v2, v2, v3
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_ldexp_f32 v1, v1, v8
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = uitofp <3 x i64> %x to <3 x bfloat>
+  ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
+; GCN-LABEL: v_uitofp_v4i64_to_v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_ffbh_u32_e32 v8, v7
+; GCN-NEXT:    v_ffbh_u32_e32 v9, v5
+; GCN-NEXT:    v_ffbh_u32_e32 v10, v3
+; GCN-NEXT:    v_ffbh_u32_e32 v11, v1
+; GCN-NEXT:    v_min_u32_e32 v8, 32, v8
+; GCN-NEXT:    v_min_u32_e32 v9, 32, v9
+; GCN-NEXT:    v_min_u32_e32 v10, 32, v10
+; GCN-NEXT:    v_min_u32_e32 v11, 32, v11
+; GCN-NEXT:    v_lshl_b64 v[6:7], v[6:7], v8
+; GCN-NEXT:    v_sub_i32_e32 v8, vcc, 32, v8
+; GCN-NEXT:    v_lshl_b64 v[4:5], v[4:5], v9
+; GCN-NEXT:    v_sub_i32_e32 v9, vcc, 32, v9
+; GCN-NEXT:    v_lshl_b64 v[2:3], v[2:3], v10
+; GCN-NEXT:    v_sub_i32_e32 v10, vcc, 32, v10
+; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v11
+; GCN-NEXT:    v_sub_i32_e32 v11, vcc, 32, v11
+; GCN-NEXT:    v_min_u32_e32 v6, 1, v6
+; GCN-NEXT:    v_min_u32_e32 v4, 1, v4
+; GCN-NEXT:    v_min_u32_e32 v2, 1, v2
+; GCN-NEXT:    v_min_u32_e32 v0, 1, v0
+; GCN-NEXT:    v_or_b32_e32 v6, v7, v6
+; GCN-NEXT:    v_or_b32_e32 v4, v5, v4
+; GCN-NEXT:    v_or_b32_e32 v2, v3, v2
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v6
+; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v4
+; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GCN-NEXT:    v_ldexp_f32_e32 v4, v1, v8
+; GCN-NEXT:    v_ldexp_f32_e32 v3, v3, v9
+; GCN-NEXT:    v_ldexp_f32_e32 v1, v2, v10
+; GCN-NEXT:    v_ldexp_f32_e32 v0, v0, v11
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_uitofp_v4i64_to_v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_ffbh_u32_e32 v8, v7
+; GFX7-NEXT:    v_min_u32_e32 v8, 32, v8
+; GFX7-NEXT:    v_lshl_b64 v[6:7], v[6:7], v8
+; GFX7-NEXT:    v_min_u32_e32 v6, 1, v6
+; GFX7-NEXT:    v_or_b32_e32 v6, v7, v6
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v6, v6
+; GFX7-NEXT:    v_sub_i32_e32 v7, vcc, 32, v8
+; GFX7-NEXT:    v_ffbh_u32_e32 v8, v5
+; GFX7-NEXT:    v_ldexp_f32_e32 v6, v6, v7
+; GFX7-NEXT:    v_ffbh_u32_e32 v7, v3
+; GFX7-NEXT:    v_min_u32_e32 v7, 32, v7
+; GFX7-NEXT:    v_lshl_b64 v[2:3], v[2:3], v7
+; GFX7-NEXT:    v_min_u32_e32 v8, 32, v8
+; GFX7-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX7-NEXT:    v_lshl_b64 v[4:5], v[4:5], v8
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT:    v_ffbh_u32_e32 v3, v1
+; GFX7-NEXT:    v_min_u32_e32 v3, 32, v3
+; GFX7-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], v3
+; GFX7-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v4, v4
+; GFX7-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX7-NEXT:    v_sub_i32_e32 v5, vcc, 32, v8
+; GFX7-NEXT:    v_ldexp_f32_e32 v4, v4, v5
+; GFX7-NEXT:    v_sub_i32_e32 v5, vcc, 32, v7
+; GFX7-NEXT:    v_ldexp_f32_e32 v1, v2, v5
+; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, 32, v3
+; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v6
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_uitofp_v4i64_to_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_ffbh_u32_e32 v8, v7
+; GFX8-NEXT:    v_min_u32_e32 v8, 32, v8
+; GFX8-NEXT:    v_lshlrev_b64 v[6:7], v8, v[6:7]
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_min_u32_e32 v6, 1, v6
+; GFX8-NEXT:    v_or_b32_e32 v6, v7, v6
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v6, v6
+; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, 32, v8
+; GFX8-NEXT:    v_ffbh_u32_e32 v8, v5
+; GFX8-NEXT:    v_ldexp_f32 v6, v6, v7
+; GFX8-NEXT:    v_ffbh_u32_e32 v7, v3
+; GFX8-NEXT:    v_min_u32_e32 v7, 32, v7
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
+; GFX8-NEXT:    v_min_u32_e32 v8, 32, v8
+; GFX8-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_ffbh_u32_e32 v3, v1
+; GFX8-NEXT:    v_min_u32_e32 v3, 32, v3
+; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
+; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, v4
+; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v8
+; GFX8-NEXT:    v_ldexp_f32 v4, v4, v5
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v7
+; GFX8-NEXT:    v_ldexp_f32 v1, v2, v5
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v3
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX8-NEXT:    v_perm_b32 v1, v4, v6, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_uitofp_v4i64_to_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_ffbh_u32_e32 v8, v5
+; GFX9-NEXT:    v_min_u32_e32 v8, 32, v8
+; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX9-NEXT:    v_sub_u32_e32 v8, 32, v8
+; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v9, v4
+; GFX9-NEXT:    v_ffbh_u32_e32 v4, v7
+; GFX9-NEXT:    v_min_u32_e32 v10, 32, v4
+; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v10, v[6:7]
+; GFX9-NEXT:    v_ffbh_u32_e32 v7, v3
+; GFX9-NEXT:    v_min_u32_e32 v7, 32, v7
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
+; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX9-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX9-NEXT:    v_ffbh_u32_e32 v3, v1
+; GFX9-NEXT:    v_min_u32_e32 v3, 32, v3
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
+; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v7
+; GFX9-NEXT:    v_sub_u32_e32 v5, 32, v10
+; GFX9-NEXT:    v_ldexp_f32 v1, v2, v1
+; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v3
+; GFX9-NEXT:    v_ldexp_f32 v4, v4, v5
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_ldexp_f32 v6, v9, v8
+; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GFX9-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_uitofp_v4i64_to_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_ffbh_u32_e32 v8, v5
+; GFX10-NEXT:    v_ffbh_u32_e32 v9, v7
+; GFX10-NEXT:    v_ffbh_u32_e32 v10, v3
+; GFX10-NEXT:    v_ffbh_u32_e32 v11, v1
+; GFX10-NEXT:    v_min_u32_e32 v8, 32, v8
+; GFX10-NEXT:    v_min_u32_e32 v9, 32, v9
+; GFX10-NEXT:    v_min_u32_e32 v10, 32, v10
+; GFX10-NEXT:    v_min_u32_e32 v11, 32, v11
+; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], v9, v[6:7]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v10, v[2:3]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v11, v[0:1]
+; GFX10-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX10-NEXT:    v_min_u32_e32 v6, 1, v6
+; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX10-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX10-NEXT:    v_or_b32_e32 v5, v7, v6
+; GFX10-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 32, v8
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v4
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, v5
+; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 32, v9
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v5, 32, v10
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v7, 32, v11
+; GFX10-NEXT:    v_ldexp_f32 v3, v3, v4
+; GFX10-NEXT:    v_ldexp_f32 v1, v1, v6
+; GFX10-NEXT:    v_ldexp_f32 v2, v2, v5
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v7
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_uitofp_v4i64_to_v4bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_clz_i32_u32_e32 v8, v5
+; GFX11-NEXT:    v_clz_i32_u32_e32 v9, v7
+; GFX11-NEXT:    v_clz_i32_u32_e32 v10, v3
+; GFX11-NEXT:    v_clz_i32_u32_e32 v11, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_min_u32_e32 v8, 32, v8
+; GFX11-NEXT:    v_min_u32_e32 v9, 32, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_min_u32_e32 v10, 32, v10
+; GFX11-NEXT:    v_min_u32_e32 v11, 32, v11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v9, v[6:7]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v10, v[2:3]
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v11, v[0:1]
+; GFX11-NEXT:    v_sub_nc_u32_e32 v8, 32, v8
+; GFX11-NEXT:    v_sub_nc_u32_e32 v9, 32, v9
+; GFX11-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX11-NEXT:    v_min_u32_e32 v6, 1, v6
+; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX11-NEXT:    v_or_b32_e32 v5, v7, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 32, v10
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v4
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v5
+; GFX11-NEXT:    v_sub_nc_u32_e32 v4, 32, v11
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    v_ldexp_f32 v1, v1, v8
+; GFX11-NEXT:    v_ldexp_f32 v3, v3, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ldexp_f32 v2, v2, v6
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = uitofp <4 x i64> %x to <4 x bfloat>
+  ret <4 x bfloat> %op
+}
+
+define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
+; GCN-LABEL: v_select_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_select_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_select_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_select_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_select_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_select_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = select i1 %cond, bfloat %a, bfloat %b
+  ret bfloat %op
+}
+
+define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
+; GCN-LABEL: v_select_fneg_lhs_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
+; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_select_fneg_lhs_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_select_fneg_lhs_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_select_fneg_lhs_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_select_fneg_lhs_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_select_fneg_lhs_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %neg.a = fneg bfloat %a
+  %op = select i1 %cond, bfloat %neg.a, bfloat %b
+  ret bfloat %op
+}
+
+define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
+; GCN-LABEL: v_select_fneg_rhs_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
+; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_select_fneg_rhs_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_select_fneg_rhs_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_select_fneg_rhs_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_select_fneg_rhs_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_select_fneg_rhs_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %neg.b = fneg bfloat %b
+  %op = select i1 %cond, bfloat %a, bfloat %neg.b
+  ret bfloat %op
+}
+
+define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_select_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_select_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_select_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_select_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_select_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_select_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v1 :: v_dual_cndmask_b32 v1, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b
+  ret <2 x bfloat> %op
+}
+
+define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_vselect_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_vselect_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_vselect_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_vselect_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_vselect_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
+; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_vselect_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v3, v2 :: v_dual_and_b32 v1, 1, v1
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b
+  ret <2 x bfloat> %op
+}
+
+define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) {
+; GCN-LABEL: s_select_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: s_select_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v2, s0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_select_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_select_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_select_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-NEXT:    s_lshr_b32 s0, s1, 16
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, s0, v1, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_select_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-NEXT:    s_lshr_b32 s0, s1, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, s0, v1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    ; return to shader part epilog
+  %cond = icmp eq i32 %c, 0
+  %op = select i1 %cond, bfloat %a, bfloat %b
+  %cast = bitcast bfloat %op to i16
+  %zext = zext i16 %cast to i32
+  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+  ret i32 %readlane
+}
+
+define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, i32 %c) {
+; GCN-LABEL: s_select_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_lshr_b32 s1, s1, 16
+; GCN-NEXT:    s_lshr_b32 s0, s0, 16
+; GCN-NEXT:    s_lshr_b32 s2, s2, 16
+; GCN-NEXT:    s_lshr_b32 s3, s3, 16
+; GCN-NEXT:    v_mov_b32_e32 v1, s2
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: s_select_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX7-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX7-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX7-NEXT:    s_lshr_b32 s3, s3, 16
+; GFX7-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7-NEXT:    v_mov_b32_e32 v2, s0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_select_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_select_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_select_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-NEXT:    s_lshr_b32 s2, s1, 16
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, s2, v1, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, s1, v1, vcc_lo
+; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_select_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, s3, v1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_cndmask_b32 v1, s1, v2 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    ; return to shader part epilog
+  %cond = icmp eq i32 %c, 0
+  %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b
+  %cast = bitcast <2 x bfloat> %op to i32
+  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+  ret i32 %readlane
+}
+
+define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x i32> %c) {
+; GCN-LABEL: s_vselect_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_mov_b32_e32 v3, s0
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: s_vselect_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_mov_b32_e32 v2, s3
+; GFX7-NEXT:    v_mov_b32_e32 v3, s1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    v_mov_b32_e32 v3, s0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_vselect_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-NEXT:    v_mov_b32_e32 v2, s3
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NEXT:    v_mov_b32_e32 v3, s0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_vselect_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_vselect_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-NEXT:    s_lshr_b32 s2, s1, 16
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, s2, v2, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, s1, v2, vcc_lo
+; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_vselect_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT:    s_lshr_b32 s0, s1, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, s0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, s1, v3 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    ; return to shader part epilog
+  %cond = icmp eq <2 x i32> %c, zeroinitializer
+  %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b
+  %cast = bitcast <2 x bfloat> %op to i32
+  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+  ret i32 %readlane
+}
+
+declare bfloat @llvm.fma.bf16(bfloat, bfloat, bfloat)
+declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
+declare <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)
+declare <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
+
+define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
+; GCN-LABEL: v_fma_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fma_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fma_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fma_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fma_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_fmac_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fma_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e32 v2, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
+  ret bfloat %op
+}
+
+define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
+; GCN-LABEL: v_fma_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GCN-NEXT:    v_fma_f32 v0, v0, v2, v4
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fma_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_fma_f32 v0, v0, v2, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fma_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_fma_f32 v3, v5, v4, v3
+; GFX8-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fma_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_fma_f32 v3, v5, v4, v3
+; GFX9-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fma_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_fmac_f32_e32 v3, v5, v4
+; GFX10-NEXT:    v_fmac_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_perm_b32 v0, v2, v3, 0x3020706
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fma_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_fmac_f32 v2, v0, v1 :: v_dual_fmac_f32 v3, v5, v4
+; GFX11-NEXT:    v_perm_b32 v0, v2, v3, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
+  ret <2 x bfloat> %op
+}
+
+define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) {
+; GCN-LABEL: v_fma_v3bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_fma_f32 v2, v2, v5, v8
+; GCN-NEXT:    v_fma_f32 v1, v1, v4, v7
+; GCN-NEXT:    v_fma_f32 v0, v0, v3, v6
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fma_v3bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_fma_f32 v2, v2, v5, v8
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_fma_f32 v1, v1, v4, v5
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_fma_f32 v0, v0, v3, v4
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fma_v3bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_fma_f32 v3, v6, v5, v3
+; GFX8-NEXT:    v_fma_f32 v0, v0, v2, v4
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fma_v3bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_fma_f32 v3, v6, v5, v3
+; GFX9-NEXT:    v_fma_f32 v0, v0, v2, v4
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fma_v3bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v4
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_fmac_f32_e32 v6, v8, v7
+; GFX10-NEXT:    v_fmac_f32_e32 v4, v0, v2
+; GFX10-NEXT:    v_fmac_f32_e32 v5, v1, v3
+; GFX10-NEXT:    v_perm_b32 v0, v4, v6, 0x3020706
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fma_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_fmac_f32_e32 v6, v8, v7
+; GFX11-NEXT:    v_dual_fmac_f32 v4, v0, v2 :: v_dual_fmac_f32 v5, v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v4, v6, 0x3020706
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
+  ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
+; GCN-LABEL: v_fma_v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_fma_f32 v3, v3, v7, v11
+; GCN-NEXT:    v_fma_f32 v2, v2, v6, v10
+; GCN-NEXT:    v_fma_f32 v1, v1, v5, v9
+; GCN-NEXT:    v_fma_f32 v0, v0, v4, v8
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fma_v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_fma_f32 v3, v3, v7, v11
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v10
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_fma_f32 v2, v2, v6, v7
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v9
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_fma_f32 v1, v1, v5, v6
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v8
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_fma_f32 v0, v0, v4, v5
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fma_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_fma_f32 v6, v8, v7, v6
+; GFX8-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_fma_f32 v3, v7, v5, v3
+; GFX8-NEXT:    v_fma_f32 v0, v0, v2, v4
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fma_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_fma_f32 v6, v8, v7, v6
+; GFX9-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_fma_f32 v3, v7, v5, v3
+; GFX9-NEXT:    v_fma_f32 v0, v0, v2, v4
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fma_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_fmac_f32_e32 v5, v1, v3
+; GFX10-NEXT:    v_fmac_f32_e32 v6, v8, v7
+; GFX10-NEXT:    v_fmac_f32_e32 v9, v11, v10
+; GFX10-NEXT:    v_fmac_f32_e32 v4, v0, v2
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
+; GFX10-NEXT:    v_perm_b32 v0, v4, v9, 0x3020706
+; GFX10-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fma_v4bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_fmac_f32_e32 v5, v1, v3
+; GFX11-NEXT:    v_dual_fmac_f32 v6, v8, v7 :: v_dual_and_b32 v7, 0xffff0000, v4
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; GFX11-NEXT:    v_dual_fmac_f32 v7, v9, v8 :: v_dual_fmac_f32 v4, v0, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v4, v7, 0x3020706
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
+  ret <4 x bfloat> %op
+}
+
+declare bfloat @llvm.fmuladd.bf16(bfloat, bfloat, bfloat)
+declare <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
+declare <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)
+declare <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
+
+define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
+; GCN-LABEL: v_fmuladd_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmuladd_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmuladd_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmuladd_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmuladd_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fmuladd_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v1 :: v_dual_and_b32 v1, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.fmuladd.bf16(bfloat %a, bfloat %b, bfloat %c)
+  ret bfloat %op
+}
+
+define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
+; GCN-LABEL: v_fmuladd_v2bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v5
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v4
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmuladd_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmuladd_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmuladd_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmuladd_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_add_f32_e32 v1, v3, v1
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fmuladd_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v1 :: v_dual_and_b32 v1, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v1, v3, v1
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
+  ret <2 x bfloat> %op
+}
+
+define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) {
+; GCN-LABEL: v_fmuladd_v3bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GCN-NEXT:    v_mul_f32_e32 v1, v1, v4
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v3
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_add_f32_e32 v2, v2, v8
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v7
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v6
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmuladd_v3bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v4
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v8
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v7
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v6
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v4
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmuladd_v3bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmuladd_v3bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v4
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX9-NEXT:    v_add_f32_e32 v6, v6, v7
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_perm_b32 v0, v0, v6, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmuladd_v3bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fmuladd_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_add_f32 v1, v1, v5 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX11-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
+  ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
+; GCN-LABEL: v_fmuladd_v4bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT:    v_mul_f32_e32 v3, v3, v7
+; GCN-NEXT:    v_mul_f32_e32 v2, v2, v6
+; GCN-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GCN-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_add_f32_e32 v3, v3, v11
+; GCN-NEXT:    v_add_f32_e32 v2, v2, v10
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v9
+; GCN-NEXT:    v_add_f32_e32 v0, v0, v8
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmuladd_v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v3, v3, v7
+; GFX7-NEXT:    v_mul_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v11
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v10
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v9
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v8
+; GFX7-NEXT:    v_add_f32_e32 v3, v3, v7
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmuladd_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX8-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v5
+; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT:    v_add_f32_e32 v6, v6, v7
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmuladd_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX9-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v3, v5, v3
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX9-NEXT:    v_add_f32_e32 v6, v6, v7
+; GFX9-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmuladd_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v6
+; GFX10-NEXT:    v_mul_f32_e32 v6, v8, v7
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v4
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX10-NEXT:    v_add_f32_e32 v2, v3, v7
+; GFX10-NEXT:    v_add_f32_e32 v3, v5, v6
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x3020706
+; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fmuladd_v4bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_add_f32 v1, v1, v5 :: v_dual_mul_f32 v6, v7, v6
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_add_f32 v2, v6, v9 :: v_dual_mul_f32 v3, v8, v7
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX11-NEXT:    v_add_f32_e32 v3, v3, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x3020706
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %op = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
+  ret <4 x bfloat> %op
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
index 14d98c84811971..3a6ecc21494893 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
@@ -1870,5 +1870,149 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr
   ret void
 }
 
+define void @void_func_bf16_inreg(bfloat inreg %arg0) #0 {
+; GFX9-LABEL: void_func_bf16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_short_d16_hi v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_bf16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    global_store_d16_hi_b16 v[0:1], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store bfloat %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v2bf16_inreg(<2 x bfloat> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v2bf16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2bf16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <2 x bfloat> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v3bf16_inreg(<3 x bfloat> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v3bf16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NEXT:    global_store_short v[0:1], v0, off
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v3bf16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-NEXT:    global_store_b32 v[0:1], v1, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <3 x bfloat> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v4bf16_inreg(<4 x bfloat> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v4bf16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v4bf16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <4 x bfloat> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v8bf16_inreg(<8 x bfloat> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v8bf16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v8bf16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <8 x bfloat> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v16bf16_inreg(<16 x bfloat> inreg %arg0) #0 {
+; GFX9-LABEL: void_func_v16bf16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v16bf16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX11-NEXT:    v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <16 x bfloat> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind noinline }
+
+
+
+
+
+
+

diff  --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 01dcc26566663b..de9e320a363a0c 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -2761,8 +2761,8 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0
 }
 
 ; FIXME: Different ext load types on CI vs. VI
-define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i16 %arg3, half %arg4) #0 {
-; CI-LABEL: void_func_v32i32_i1_i8_i16:
+define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i16 %arg3, half %arg4, bfloat %arg5) #0 {
+; CI-LABEL: void_func_v32i32_i1_i8_i16_bf16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
@@ -2775,12 +2775,13 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:16
 ; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_ubyte v16, off, s[0:3], s32 offset:4
-; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
-; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
+; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:20
+; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:16
+; CI-NEXT:    buffer_load_ubyte v17, off, s[0:3], s32 offset:4
+; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:8
+; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:12
 ; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2789,19 +2790,22 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_cvt_f16_f32_e32 v19, v20
-; CI-NEXT:    v_and_b32_e32 v0, 1, v16
+; CI-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; CI-NEXT:    v_and_b32_e32 v0, 1, v17
+; CI-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
 ; CI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_byte v17, off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_short v18, off, s[4:7], 0
+; CI-NEXT:    buffer_store_byte v18, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_short v19, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_short v16, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_short v20, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: void_func_v32i32_i1_i8_i16:
+; VI-LABEL: void_func_v32i32_i1_i8_i16_bf16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
@@ -2814,12 +2818,13 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_ubyte v20, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:8
-; VI-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:12
-; VI-NEXT:    buffer_load_ushort v18, off, s[0:3], s32 offset:16
+; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_load_ubyte v16, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_ushort v18, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:16
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2828,18 +2833,21 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v0, 1, v20
+; VI-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; VI-NEXT:    v_and_b32_e32 v0, 1, v16
 ; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_byte v16, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_short v17, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v17, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_short v18, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_short v19, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_short v20, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: void_func_v32i32_i1_i8_i16:
+; GFX9-LABEL: void_func_v32i32_i1_i8_i16_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
@@ -2852,13 +2860,13 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_ubyte v20, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    buffer_load_ushort v18, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_load_ubyte v16, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_ushort v18, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:16
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -2868,26 +2876,30 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v20
+; GFX9-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v16
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_byte v16, off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v17, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v17, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_short v18, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v19, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v20, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_v32i32_i1_i8_i16:
+; GFX11-LABEL: void_func_v32i32_i1_i8_i16_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x4
+; GFX11-NEXT:    s_clause 0x5
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:20
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    scratch_load_u8 v32, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_u16 v33, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u16 v34, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u16 v35, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_u8 v33, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_u16 v34, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_u16 v35, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_u16 v36, off, s32 offset:16
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
@@ -2900,7 +2912,8 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; GFX11-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    v_and_b32_e32 v16, 1, v32
+; GFX11-NEXT:    v_and_b32_e32 v16, 1, v33
+; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
 ; GFX11-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
@@ -2912,13 +2925,15 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; GFX11-NEXT:    buffer_store_b8 v16, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    buffer_store_b8 v33, off, s[0:3], 0 dlc
+; GFX11-NEXT:    buffer_store_b8 v34, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    buffer_store_b16 v34, off, s[0:3], 0 dlc
+; GFX11-NEXT:    buffer_store_b16 v35, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_store_b16 v35, off, s[0:3], 0 dlc
+; GFX11-NEXT:    buffer_store_b16 v36, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b16 v32, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
@@ -2926,6 +2941,7 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
   store volatile i8 %arg2, ptr addrspace(1) undef
   store volatile i16 %arg3, ptr addrspace(1) undef
   store volatile half %arg4, ptr addrspace(1) undef
+  store volatile bfloat %arg5, ptr addrspace(1) undef
   ret void
 }
 
@@ -3071,8 +3087,8 @@ define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2
   ret void
 }
 
-define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2 x half> %arg2) #0 {
-; CI-LABEL: void_func_v32i32_v2i16_v2f16:
+define void @void_func_v32i32_v2i16_v2f16_v2bf16(<32 x i32> %arg0, <2 x i16> %arg1, <2 x half> %arg2, <2 x bfloat> %arg3) #0 {
+; CI-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
@@ -3087,32 +3103,39 @@ define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:12
-; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:16
-; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
-; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:4
-; CI-NEXT:    s_waitcnt vmcnt(2)
-; CI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; CI-NEXT:    v_cvt_f16_f32_e32 v19, v20
+; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:20
+; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:24
+; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:12
+; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:16
+; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:8
 ; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
+; CI-NEXT:    v_lshrrev_b32_e32 v14, 16, v16
+; CI-NEXT:    v_cvt_f16_f32_e32 v15, v17
+; CI-NEXT:    v_lshrrev_b32_e32 v13, 16, v20
+; CI-NEXT:    v_cvt_f16_f32_e32 v16, v18
 ; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_short v17, off, s[4:7], 0
+; CI-NEXT:    buffer_store_short v19, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_short v18, off, s[4:7], 0
+; CI-NEXT:    buffer_store_short v12, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_short v16, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_short v19, off, s[4:7], 0
+; CI-NEXT:    buffer_store_short v15, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_short v14, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_short v13, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: void_func_v32i32_v2i16_v2f16:
+; VI-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
@@ -3125,10 +3148,11 @@ define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -3141,9 +3165,11 @@ define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v16, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dword v17, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: void_func_v32i32_v2i16_v2f16:
+; GFX9-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
@@ -3156,11 +3182,11 @@ define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -3174,18 +3200,21 @@ define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v16, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v17, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_v32i32_v2i16_v2f16:
+; GFX11-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:12
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
@@ -3202,16 +3231,20 @@ define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-NEXT:    buffer_store_b32 v32, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    buffer_store_b32 v33, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_store_b32 v34, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile <2 x i16> %arg1, ptr addrspace(1) undef
   store volatile <2 x half> %arg2, ptr addrspace(1) undef
+  store volatile <2 x bfloat> %arg3, ptr addrspace(1) undef
   ret void
 }
 
@@ -4566,4 +4599,219 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
   ret void
 }
 
+
+define void @void_func_bf16(bfloat %arg0) #0 {
+; CIGFX89-LABEL: void_func_bf16:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store bfloat %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v2bf16(<2 x bfloat> %arg0) #0 {
+; CI-LABEL: void_func_v2bf16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; CI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: void_func_v2bf16:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <2 x bfloat> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v3bf16(<3 x bfloat> %arg0) #0 {
+; CI-LABEL: void_func_v3bf16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; CI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_store_short v1, off, s[4:7], 0
+; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: void_func_v3bf16:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_store_short v1, off, s[4:7], 0
+; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_store_b16 v1, off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <3 x bfloat> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v4bf16(<4 x bfloat> %arg0) #0 {
+; CI-LABEL: void_func_v4bf16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; CI-NEXT:    v_alignbit_b32 v2, v3, v2, 16
+; CI-NEXT:    v_alignbit_b32 v1, v1, v0, 16
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_store_dwordx2 v[1:2], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: void_func_v4bf16:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v4bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <4 x bfloat> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v8bf16(<8 x bfloat> %arg0) #0 {
+; CI-LABEL: void_func_v8bf16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; CI-NEXT:    v_alignbit_b32 v6, v7, v6, 16
+; CI-NEXT:    v_alignbit_b32 v5, v5, v4, 16
+; CI-NEXT:    v_alignbit_b32 v4, v3, v2, 16
+; CI-NEXT:    v_alignbit_b32 v3, v1, v0, 16
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_store_dwordx4 v[3:6], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: void_func_v8bf16:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v8bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <8 x bfloat> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @void_func_v16bf16(<16 x bfloat> %arg0) #0 {
+; CI-LABEL: void_func_v16bf16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; CI-NEXT:    v_alignbit_b32 v5, v5, v4, 16
+; CI-NEXT:    v_alignbit_b32 v4, v3, v2, 16
+; CI-NEXT:    v_alignbit_b32 v3, v1, v0, 16
+; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
+; CI-NEXT:    v_alignbit_b32 v14, v0, v14, 16
+; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v13
+; CI-NEXT:    v_alignbit_b32 v13, v0, v12, 16
+; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
+; CI-NEXT:    v_alignbit_b32 v12, v0, v10, 16
+; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v9
+; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; CI-NEXT:    v_alignbit_b32 v11, v0, v8, 16
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    v_alignbit_b32 v6, v7, v6, 16
+; CI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[4:7], 0
+; CI-NEXT:    buffer_store_dwordx4 v[3:6], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: void_func_v16bf16:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v16bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  store <16 x bfloat> %arg0, ptr addrspace(1) undef
+  ret void
+}
+
 attributes #0 = { nounwind }

diff  --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index e7d86c0c178e97..d0a8e539056529 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -2354,4 +2354,375 @@ define void @void_func_sret_max_known_zero_bits(ptr addrspace(5) sret(i8) %arg0)
   ret void
 }
 
+define bfloat @bf16_func_void() #0 {
+; CI-LABEL: bf16_func_void:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: bf16_func_void:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
+; GFX8-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bf16_func_void:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_short_d16_hi v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bf16_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_load_d16_hi_b16 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = load bfloat, ptr addrspace(1) undef
+  ret bfloat %val
+}
+
+define <2 x bfloat> @v2bf16_func_void() #0 {
+; CI-LABEL: v2bf16_func_void:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; CI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v2bf16_func_void:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v2bf16_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = load <2 x bfloat>, ptr addrspace(1) undef
+  ret <2 x bfloat> %val
+}
+
+define <3 x bfloat> @v3bf16_func_void() #0 {
+; CI-LABEL: v3bf16_func_void:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_load_dwordx2 v[1:2], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; CI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v3bf16_func_void:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
+; GFX8-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v3bf16_func_void:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_bfi_b32 v2, v2, 0, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v3bf16_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_bfi_b32 v2, 0xffff, 0, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = load <3 x bfloat>, ptr addrspace(1) undef
+  ret <3 x bfloat> %val
+}
+
+define <4 x bfloat> @v4bf16_func_void() #0 {
+; CI-LABEL: v4bf16_func_void:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; CI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; CI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v4bf16_func_void:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v4bf16_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = load <4 x bfloat>, ptr addrspace(1) undef
+  ret <4 x bfloat> %val
+}
+
+define <6 x bfloat> @v6bf16_func_void() #0 {
+; CI-LABEL: v6bf16_func_void:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_load_dwordx3 v[3:5], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
+; CI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; CI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
+; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; CI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v6bf16_func_void:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v6bf16_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_b96 v[0:2], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = load <6 x bfloat>, ptr addrspace(1) undef
+  ret <6 x bfloat> %val
+}
+
+define <8 x bfloat> @v8bf16_func_void() #0 {
+; CI-LABEL: v8bf16_func_void:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; CI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; CI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; CI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; CI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v8bf16_func_void:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v8bf16_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = load <8 x bfloat>, ptr addrspace(1) undef
+  ret <8 x bfloat> %val
+}
+
+define <16 x bfloat> @v16bf16_func_void() #0 {
+; CI-LABEL: v16bf16_func_void:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; CI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; CI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; CI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; CI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; CI-NEXT:    v_mov_b32_e32 v8, v0
+; CI-NEXT:    v_mov_b32_e32 v9, v1
+; CI-NEXT:    v_mov_b32_e32 v10, v2
+; CI-NEXT:    v_mov_b32_e32 v11, v3
+; CI-NEXT:    v_mov_b32_e32 v12, v4
+; CI-NEXT:    v_mov_b32_e32 v13, v5
+; CI-NEXT:    v_mov_b32_e32 v14, v6
+; CI-NEXT:    v_mov_b32_e32 v15, v7
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v16bf16_func_void:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    v_mov_b32_e32 v4, v0
+; GFX89-NEXT:    v_mov_b32_e32 v5, v1
+; GFX89-NEXT:    v_mov_b32_e32 v6, v2
+; GFX89-NEXT:    v_mov_b32_e32 v7, v3
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v16bf16_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
+; GFX11-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = load <16 x bfloat>, ptr addrspace(1) undef
+  ret <16 x bfloat> %val
+}
+
+define <32 x bfloat> @v32bf16_func_void() #0 {
+; CI-LABEL: v32bf16_func_void:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; CI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; CI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; CI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; CI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; CI-NEXT:    v_mov_b32_e32 v8, v0
+; CI-NEXT:    v_mov_b32_e32 v9, v1
+; CI-NEXT:    v_mov_b32_e32 v10, v2
+; CI-NEXT:    v_mov_b32_e32 v11, v3
+; CI-NEXT:    v_mov_b32_e32 v12, v4
+; CI-NEXT:    v_mov_b32_e32 v13, v5
+; CI-NEXT:    v_mov_b32_e32 v14, v6
+; CI-NEXT:    v_mov_b32_e32 v16, v0
+; CI-NEXT:    v_mov_b32_e32 v17, v1
+; CI-NEXT:    v_mov_b32_e32 v18, v2
+; CI-NEXT:    v_mov_b32_e32 v19, v3
+; CI-NEXT:    v_mov_b32_e32 v20, v4
+; CI-NEXT:    v_mov_b32_e32 v21, v5
+; CI-NEXT:    v_mov_b32_e32 v24, v0
+; CI-NEXT:    v_mov_b32_e32 v25, v1
+; CI-NEXT:    v_mov_b32_e32 v26, v2
+; CI-NEXT:    v_mov_b32_e32 v27, v3
+; CI-NEXT:    v_mov_b32_e32 v28, v4
+; CI-NEXT:    v_mov_b32_e32 v29, v5
+; CI-NEXT:    v_mov_b32_e32 v22, v6
+; CI-NEXT:    v_mov_b32_e32 v30, v6
+; CI-NEXT:    v_mov_b32_e32 v15, v7
+; CI-NEXT:    v_mov_b32_e32 v23, v7
+; CI-NEXT:    v_mov_b32_e32 v31, v7
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v32bf16_func_void:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    v_mov_b32_e32 v4, v0
+; GFX89-NEXT:    v_mov_b32_e32 v5, v1
+; GFX89-NEXT:    v_mov_b32_e32 v6, v2
+; GFX89-NEXT:    v_mov_b32_e32 v7, v3
+; GFX89-NEXT:    v_mov_b32_e32 v8, v0
+; GFX89-NEXT:    v_mov_b32_e32 v9, v1
+; GFX89-NEXT:    v_mov_b32_e32 v10, v2
+; GFX89-NEXT:    v_mov_b32_e32 v11, v3
+; GFX89-NEXT:    v_mov_b32_e32 v12, v0
+; GFX89-NEXT:    v_mov_b32_e32 v13, v1
+; GFX89-NEXT:    v_mov_b32_e32 v14, v2
+; GFX89-NEXT:    v_mov_b32_e32 v15, v3
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v32bf16_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
+; GFX11-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX11-NEXT:    v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1
+; GFX11-NEXT:    v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v3
+; GFX11-NEXT:    v_dual_mov_b32 v12, v0 :: v_dual_mov_b32 v13, v1
+; GFX11-NEXT:    v_dual_mov_b32 v14, v2 :: v_dual_mov_b32 v15, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = load <32 x bfloat>, ptr addrspace(1) undef
+  ret <32 x bfloat> %val
+}
+
 attributes #0 = { nounwind }

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index f827a78125b778..bdaa224439c534 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -53,6 +53,22 @@ declare hidden amdgpu_gfx void @external_void_func_v3f16(<3 x half>) #0
 declare hidden amdgpu_gfx void @external_void_func_v4i16(<4 x i16>) #0
 declare hidden amdgpu_gfx void @external_void_func_v4f16(<4 x half>) #0
 
+declare hidden amdgpu_gfx void @external_void_func_bf16(bfloat) #0
+declare hidden amdgpu_gfx void @external_void_func_v1bf16(<1 x bfloat>) #0
+declare hidden amdgpu_gfx void @external_void_func_v2bf16(<2 x bfloat>) #0
+declare hidden amdgpu_gfx void @external_void_func_v3bf16(<3 x bfloat>) #0
+declare hidden amdgpu_gfx void @external_void_func_v4bf16(<4 x bfloat>) #0
+declare hidden amdgpu_gfx void @external_void_func_v8bf16(<8 x bfloat>) #0
+declare hidden amdgpu_gfx void @external_void_func_v16bf16(<16 x bfloat>) #0
+
+declare hidden amdgpu_gfx void @external_void_func_bf16_inreg(bfloat inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v1bf16_inreg(<1 x bfloat> inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v2bf16_inreg(<2 x bfloat> inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v3bf16_inreg(<3 x bfloat> inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v4bf16_inreg(<4 x bfloat> inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v8bf16_inreg(<8 x bfloat> inreg) #0
+declare hidden amdgpu_gfx void @external_void_func_v16bf16_inreg(<16 x bfloat> inreg) #0
+
 declare hidden amdgpu_gfx void @external_void_func_v2i32(<2 x i32>) #0
 declare hidden amdgpu_gfx void @external_void_func_v3i32(<3 x i32>) #0
 declare hidden amdgpu_gfx void @external_void_func_v3i32_i32(<3 x i32>, i32) #0
@@ -16570,6 +16586,1634 @@ entry:
   ret void
 }
 
+define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s34, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s35, external_void_func_bf16 at abs32@hi
+; GFX9-NEXT:    s_mov_b32 s34, external_void_func_bf16 at abs32@lo
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s34
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_call_external_void_func_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s34, s33
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    s_mov_b32 s35, external_void_func_bf16 at abs32@hi
+; GFX10-NEXT:    s_mov_b32 s34, external_void_func_bf16 at abs32@lo
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_mov_b32 s33, s34
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_call_external_void_func_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_mov_b32 s1, external_void_func_bf16 at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s0, external_void_func_bf16 at abs32@lo
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SCRATCH-LABEL: test_call_external_void_func_bf16:
+; GFX10-SCRATCH:       ; %bb.0:
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_bf16 at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
+  %val = bitcast i16 %arg to bfloat
+  call amdgpu_gfx void @external_void_func_bf16(bfloat %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v1bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s34, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v1bf16 at abs32@hi
+; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v1bf16 at abs32@lo
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s34
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_call_external_void_func_v1bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s34, s33
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v1bf16 at abs32@hi
+; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v1bf16 at abs32@lo
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_mov_b32 s33, s34
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_call_external_void_func_v1bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v1bf16 at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v1bf16 at abs32@lo
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SCRATCH-LABEL: test_call_external_void_func_v1bf16:
+; GFX10-SCRATCH:       ; %bb.0:
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v1bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v1bf16 at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
+  %val = bitcast i16 %arg to <1 x bfloat>
+  call amdgpu_gfx void @external_void_func_v1bf16(<1 x bfloat> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s34, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2bf16 at abs32@hi
+; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2bf16 at abs32@lo
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s34
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_call_external_void_func_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s34, s33
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2bf16 at abs32@hi
+; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2bf16 at abs32@lo
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_mov_b32 s33, s34
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_call_external_void_func_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2bf16 at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2bf16 at abs32@lo
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2bf16:
+; GFX10-SCRATCH:       ; %bb.0:
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2bf16 at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
+  %val = bitcast i32 %arg to <2 x bfloat>
+  call amdgpu_gfx void @external_void_func_v2bf16(<2 x bfloat> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v3bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s34, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3bf16 at abs32@hi
+; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3bf16 at abs32@lo
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s34
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_call_external_void_func_v3bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s34, s33
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3bf16 at abs32@hi
+; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3bf16 at abs32@lo
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_mov_b32 s33, s34
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_call_external_void_func_v3bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3bf16 at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3bf16 at abs32@lo
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3bf16:
+; GFX10-SCRATCH:       ; %bb.0:
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3bf16 at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
+  %val = bitcast <3 x i16> %arg to <3 x bfloat>
+  call amdgpu_gfx void @external_void_func_v3bf16(<3 x bfloat> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s34, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v4bf16 at abs32@hi
+; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v4bf16 at abs32@lo
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s34
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_call_external_void_func_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s34, s33
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v4bf16 at abs32@hi
+; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v4bf16 at abs32@lo
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_mov_b32 s33, s34
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_call_external_void_func_v4bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4bf16 at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v4bf16 at abs32@lo
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4bf16:
+; GFX10-SCRATCH:       ; %bb.0:
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v4bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v4bf16 at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
+  %val = bitcast <4 x i16> %arg to <4 x bfloat>
+  call amdgpu_gfx void @external_void_func_v4bf16(<4 x bfloat> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v8bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s34, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v8bf16 at abs32@hi
+; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v8bf16 at abs32@lo
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s34
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_call_external_void_func_v8bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s34, s33
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v8bf16 at abs32@hi
+; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v8bf16 at abs32@lo
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_mov_b32 s33, s34
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_call_external_void_func_v8bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v8bf16 at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v8bf16 at abs32@lo
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8bf16:
+; GFX10-SCRATCH:       ; %bb.0:
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v8bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v8bf16 at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
+  %val = bitcast <8 x i16> %arg to <8 x bfloat>
+  call amdgpu_gfx void @external_void_func_v8bf16(<8 x bfloat> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v16bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s34, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v16bf16 at abs32@hi
+; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v16bf16 at abs32@lo
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s34
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_call_external_void_func_v16bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s34, s33
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v16bf16 at abs32@hi
+; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v16bf16 at abs32@lo
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_mov_b32 s33, s34
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_call_external_void_func_v16bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v16bf16 at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v16bf16 at abs32@lo
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SCRATCH-LABEL: test_call_external_void_func_v16bf16:
+; GFX10-SCRATCH:       ; %bb.0:
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v16bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v16bf16 at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
+  %val = bitcast <16 x i16> %arg to <16 x bfloat>
+  call amdgpu_gfx void @external_void_func_v16bf16(<16 x bfloat> %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_bf16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s34, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    s_mov_b32 s35, external_void_func_bf16 at abs32@hi
+; GFX9-NEXT:    s_mov_b32 s34, external_void_func_bf16 at abs32@lo
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s34
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_call_external_void_func_bf16_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s34, s33
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
+; GFX10-NEXT:    s_mov_b32 s35, external_void_func_bf16 at abs32@hi
+; GFX10-NEXT:    s_mov_b32 s34, external_void_func_bf16 at abs32@lo
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_mov_b32 s33, s34
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_call_external_void_func_bf16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX11-NEXT:    s_mov_b32 s1, external_void_func_bf16 at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s0, external_void_func_bf16 at abs32@lo
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SCRATCH-LABEL: test_call_external_void_func_bf16_inreg:
+; GFX10-SCRATCH:       ; %bb.0:
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_bf16 at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
+  %val = bitcast i16 %arg to bfloat
+  call amdgpu_gfx void @external_void_func_bf16(bfloat inreg %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v1bf16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s34, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v1bf16 at abs32@hi
+; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v1bf16 at abs32@lo
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s34
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_call_external_void_func_v1bf16_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s34, s33
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v1bf16 at abs32@hi
+; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v1bf16 at abs32@lo
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_mov_b32 s33, s34
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_call_external_void_func_v1bf16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v1bf16 at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v1bf16 at abs32@lo
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SCRATCH-LABEL: test_call_external_void_func_v1bf16_inreg:
+; GFX10-SCRATCH:       ; %bb.0:
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v1bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v1bf16 at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
+  %val = bitcast i16 %arg to <1 x bfloat>
+  call amdgpu_gfx void @external_void_func_v1bf16(<1 x bfloat> inreg %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v2bf16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s34, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2bf16 at abs32@hi
+; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2bf16 at abs32@lo
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s34
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_call_external_void_func_v2bf16_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s34, s33
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2bf16 at abs32@hi
+; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2bf16 at abs32@lo
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_mov_b32 s33, s34
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_call_external_void_func_v2bf16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2bf16 at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2bf16 at abs32@lo
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2bf16_inreg:
+; GFX10-SCRATCH:       ; %bb.0:
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2bf16 at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
+  %val = bitcast i32 %arg to <2 x bfloat>
+  call amdgpu_gfx void @external_void_func_v2bf16(<2 x bfloat> inreg %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v3bf16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s34, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3bf16 at abs32@hi
+; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3bf16 at abs32@lo
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s34
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_call_external_void_func_v3bf16_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s34, s33
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
+; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3bf16 at abs32@hi
+; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3bf16 at abs32@lo
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 0
+; GFX10-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_mov_b32 s33, s34
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_call_external_void_func_v3bf16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3bf16 at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3bf16 at abs32@lo
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s5, 0
+; GFX11-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s5, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3bf16_inreg:
+; GFX10-SCRATCH:       ; %bb.0:
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3bf16 at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 0
+; GFX10-SCRATCH-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
+  %val = bitcast <3 x i16> %arg to <3 x bfloat>
+  call amdgpu_gfx void @external_void_func_v3bf16(<3 x bfloat> inreg %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v4bf16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s34, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v4bf16 at abs32@hi
+; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v4bf16 at abs32@lo
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s34
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_call_external_void_func_v4bf16_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s34, s33
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v4bf16 at abs32@hi
+; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v4bf16 at abs32@lo
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_mov_b32 s33, s34
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_call_external_void_func_v4bf16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4bf16 at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v4bf16 at abs32@lo
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4bf16_inreg:
+; GFX10-SCRATCH:       ; %bb.0:
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v4bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v4bf16 at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
+  %val = bitcast <4 x i16> %arg to <4 x bfloat>
+  call amdgpu_gfx void @external_void_func_v4bf16(<4 x bfloat> inreg %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v8bf16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s34, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v8bf16 at abs32@hi
+; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v8bf16 at abs32@lo
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s34
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_call_external_void_func_v8bf16_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s34, s33
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v8bf16 at abs32@hi
+; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v8bf16 at abs32@lo
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_mov_b32 s33, s34
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_call_external_void_func_v8bf16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v8bf16 at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v8bf16 at abs32@lo
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8bf16_inreg:
+; GFX10-SCRATCH:       ; %bb.0:
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v8bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v8bf16 at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
+  %val = bitcast <8 x i16> %arg to <8 x bfloat>
+  call amdgpu_gfx void @external_void_func_v8bf16(<8 x bfloat> inreg %val)
+  ret void
+}
+
+define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v16bf16_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s34, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v16bf16 at abs32@hi
+; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v16bf16 at abs32@lo
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s34
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_call_external_void_func_v16bf16_inreg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s34, s33
+; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v16bf16 at abs32@hi
+; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v16bf16 at abs32@lo
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
+; GFX10-NEXT:    s_mov_b32 s33, s34
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_call_external_void_func_v16bf16_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v16bf16 at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v16bf16 at abs32@lo
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SCRATCH-LABEL: test_call_external_void_func_v16bf16_inreg:
+; GFX10-SCRATCH:       ; %bb.0:
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v16bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v16bf16 at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
+; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
+  %val = bitcast <16 x i16> %arg to <16 x bfloat>
+  call amdgpu_gfx void @external_void_func_v16bf16(<16 x bfloat> inreg %val)
+  ret void
+}
+
 declare hidden amdgpu_gfx void @byval_align16_f64_arg(<32 x i32>, ptr addrspace(5) byval(double) align 16) #0
 declare hidden amdgpu_gfx void @stack_passed_f64_arg(<32 x i32>, double) #0
 declare hidden amdgpu_gfx void @external_void_func_12xv3i32(<3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>,

diff  --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
index 9542fb50d3ccd8..820510df5c6eba 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
@@ -1501,6 +1501,380 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #0 {
   ret void
 }
 
+define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %ptr) #0 {
+; GFX900-LABEL: global_atomic_fadd_ret_bf16_agent:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX900-NEXT:    s_mov_b64 s[0:1], 0
+; GFX900-NEXT:    v_mov_b32_e32 v0, 0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_and_b32 s2, s4, -4
+; GFX900-NEXT:    s_mov_b32 s3, s5
+; GFX900-NEXT:    s_load_dword s6, s[2:3], 0x0
+; GFX900-NEXT:    s_and_b32 s4, s4, 3
+; GFX900-NEXT:    s_lshl_b32 s4, s4, 3
+; GFX900-NEXT:    s_lshl_b32 s5, 0xffff, s4
+; GFX900-NEXT:    s_not_b32 s5, s5
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v1, s6
+; GFX900-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    v_mov_b32_e32 v2, v1
+; GFX900-NEXT:    v_lshrrev_b32_sdwa v1, s4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX900-NEXT:    v_add_f32_e32 v1, 4.0, v1
+; GFX900-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX900-NEXT:    v_and_or_b32 v1, v2, s5, v1
+; GFX900-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[2:3] glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_wbinvl1_vol
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX900-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX900-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX900-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX900-NEXT:    global_store_short v[0:1], v0, off
+; GFX900-NEXT:    s_endpgm
+;
+; GFX908-LABEL: global_atomic_fadd_ret_bf16_agent:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX908-NEXT:    s_mov_b64 s[0:1], 0
+; GFX908-NEXT:    v_mov_b32_e32 v0, 0
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    s_and_b32 s2, s4, -4
+; GFX908-NEXT:    s_mov_b32 s3, s5
+; GFX908-NEXT:    s_load_dword s6, s[2:3], 0x0
+; GFX908-NEXT:    s_and_b32 s4, s4, 3
+; GFX908-NEXT:    s_lshl_b32 s4, s4, 3
+; GFX908-NEXT:    s_lshl_b32 s5, 0xffff, s4
+; GFX908-NEXT:    s_not_b32 s5, s5
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v1, s6
+; GFX908-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    v_mov_b32_e32 v2, v1
+; GFX908-NEXT:    v_lshrrev_b32_sdwa v1, s4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT:    v_add_f32_e32 v1, 4.0, v1
+; GFX908-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT:    v_and_or_b32 v1, v2, s5, v1
+; GFX908-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[2:3] glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX908-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX908-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX908-NEXT:    global_store_short v[0:1], v0, off
+; GFX908-NEXT:    s_endpgm
+;
+; GFX90A-LABEL: global_atomic_fadd_ret_bf16_agent:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX90A-NEXT:    s_mov_b64 s[0:1], 0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_and_b32 s2, s4, -4
+; GFX90A-NEXT:    s_mov_b32 s3, s5
+; GFX90A-NEXT:    s_load_dword s6, s[2:3], 0x0
+; GFX90A-NEXT:    s_and_b32 s4, s4, 3
+; GFX90A-NEXT:    s_lshl_b32 s4, s4, 3
+; GFX90A-NEXT:    s_lshl_b32 s5, 0xffff, s4
+; GFX90A-NEXT:    s_not_b32 s5, s5
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NEXT:    v_lshrrev_b32_sdwa v1, s4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT:    v_add_f32_e32 v1, 4.0, v1
+; GFX90A-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT:    v_and_or_b32 v2, v3, s5, v1
+; GFX90A-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[2:3] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX90A-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX90A-NEXT:    global_store_short v[0:1], v0, off
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX10-LABEL: global_atomic_fadd_ret_bf16_agent:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_and_b32 s0, s2, -4
+; GFX10-NEXT:    s_mov_b32 s1, s3
+; GFX10-NEXT:    s_and_b32 s2, s2, 3
+; GFX10-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX10-NEXT:    s_lshl_b32 s4, 0xffff, s2
+; GFX10-NEXT:    s_not_b32 s4, s4
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    s_mov_b32 s3, 0
+; GFX10-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v2, v1
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, s2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_add_f32_e32 v1, 4.0, v1
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_and_or_b32 v1, v2, s4, v1
+; GFX10-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    s_or_b32 s3, vcc_lo, s3
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
+; GFX10-NEXT:    global_store_short v[0:1], v0, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: global_atomic_fadd_ret_bf16_agent:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_and_b32 s0, s2, -4
+; GFX11-NEXT:    s_mov_b32 s1, s3
+; GFX11-NEXT:    s_and_b32 s2, s2, 3
+; GFX11-NEXT:    s_load_b32 s3, s[0:1], 0x0
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX11-NEXT:    s_lshl_b32 s4, 0xffff, s2
+; GFX11-NEXT:    s_not_b32 s4, s4
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-NEXT:    s_mov_b32 s3, 0
+; GFX11-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    v_mov_b32_e32 v2, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, s2, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_add_f32_e32 v1, 4.0, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, s2, v1
+; GFX11-NEXT:    v_and_or_b32 v1, v2, s4, v1
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    s_or_b32 s3, vcc_lo, s3
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s3
+; GFX11-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
+; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, bfloat 4.0 syncscope("agent") seq_cst
+  store bfloat %result, ptr addrspace(1) undef
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %ptr) #0 {
+; GFX900-LABEL: global_atomic_fadd_ret_bf16_system:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX900-NEXT:    s_mov_b64 s[0:1], 0
+; GFX900-NEXT:    v_mov_b32_e32 v0, 0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_and_b32 s2, s4, -4
+; GFX900-NEXT:    s_mov_b32 s3, s5
+; GFX900-NEXT:    s_load_dword s6, s[2:3], 0x0
+; GFX900-NEXT:    s_and_b32 s4, s4, 3
+; GFX900-NEXT:    s_lshl_b32 s4, s4, 3
+; GFX900-NEXT:    s_lshl_b32 s5, 0xffff, s4
+; GFX900-NEXT:    s_not_b32 s5, s5
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v1, s6
+; GFX900-NEXT:  .LBB11_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    v_mov_b32_e32 v2, v1
+; GFX900-NEXT:    v_lshrrev_b32_sdwa v1, s4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX900-NEXT:    v_add_f32_e32 v1, 4.0, v1
+; GFX900-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX900-NEXT:    v_and_or_b32 v1, v2, s5, v1
+; GFX900-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[2:3] glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_wbinvl1_vol
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX900-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX900-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX900-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX900-NEXT:    global_store_short v[0:1], v0, off
+; GFX900-NEXT:    s_endpgm
+;
+; GFX908-LABEL: global_atomic_fadd_ret_bf16_system:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX908-NEXT:    s_mov_b64 s[0:1], 0
+; GFX908-NEXT:    v_mov_b32_e32 v0, 0
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    s_and_b32 s2, s4, -4
+; GFX908-NEXT:    s_mov_b32 s3, s5
+; GFX908-NEXT:    s_load_dword s6, s[2:3], 0x0
+; GFX908-NEXT:    s_and_b32 s4, s4, 3
+; GFX908-NEXT:    s_lshl_b32 s4, s4, 3
+; GFX908-NEXT:    s_lshl_b32 s5, 0xffff, s4
+; GFX908-NEXT:    s_not_b32 s5, s5
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v1, s6
+; GFX908-NEXT:  .LBB11_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    v_mov_b32_e32 v2, v1
+; GFX908-NEXT:    v_lshrrev_b32_sdwa v1, s4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT:    v_add_f32_e32 v1, 4.0, v1
+; GFX908-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT:    v_and_or_b32 v1, v2, s5, v1
+; GFX908-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[2:3] glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX908-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX908-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX908-NEXT:    global_store_short v[0:1], v0, off
+; GFX908-NEXT:    s_endpgm
+;
+; GFX90A-LABEL: global_atomic_fadd_ret_bf16_system:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX90A-NEXT:    s_mov_b64 s[0:1], 0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_and_b32 s2, s4, -4
+; GFX90A-NEXT:    s_mov_b32 s3, s5
+; GFX90A-NEXT:    s_load_dword s6, s[2:3], 0x0
+; GFX90A-NEXT:    s_and_b32 s4, s4, 3
+; GFX90A-NEXT:    s_lshl_b32 s4, s4, 3
+; GFX90A-NEXT:    s_lshl_b32 s5, 0xffff, s4
+; GFX90A-NEXT:    s_not_b32 s5, s5
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NEXT:  .LBB11_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NEXT:    v_lshrrev_b32_sdwa v1, s4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT:    v_add_f32_e32 v1, 4.0, v1
+; GFX90A-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT:    v_and_or_b32 v2, v3, s5, v1
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[2:3] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX90A-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
+; GFX90A-NEXT:    global_store_short v[0:1], v0, off
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX10-LABEL: global_atomic_fadd_ret_bf16_system:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_and_b32 s0, s2, -4
+; GFX10-NEXT:    s_mov_b32 s1, s3
+; GFX10-NEXT:    s_and_b32 s2, s2, 3
+; GFX10-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX10-NEXT:    s_lshl_b32 s4, 0xffff, s2
+; GFX10-NEXT:    s_not_b32 s4, s4
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    s_mov_b32 s3, 0
+; GFX10-NEXT:  .LBB11_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v2, v1
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, s2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_add_f32_e32 v1, 4.0, v1
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_and_or_b32 v1, v2, s4, v1
+; GFX10-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    s_or_b32 s3, vcc_lo, s3
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
+; GFX10-NEXT:    global_store_short v[0:1], v0, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: global_atomic_fadd_ret_bf16_system:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_and_b32 s0, s2, -4
+; GFX11-NEXT:    s_mov_b32 s1, s3
+; GFX11-NEXT:    s_and_b32 s2, s2, 3
+; GFX11-NEXT:    s_load_b32 s3, s[0:1], 0x0
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX11-NEXT:    s_lshl_b32 s4, 0xffff, s2
+; GFX11-NEXT:    s_not_b32 s4, s4
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-NEXT:    s_mov_b32 s3, 0
+; GFX11-NEXT:  .LBB11_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    v_mov_b32_e32 v2, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, s2, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_add_f32_e32 v1, 4.0, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, s2, v1
+; GFX11-NEXT:    v_and_or_b32 v1, v2, s4, v1
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    s_or_b32 s3, vcc_lo, s3
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s3
+; GFX11-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
+; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, bfloat 4.0 syncscope("one-as") seq_cst
+  store bfloat %result, ptr addrspace(1) undef
+  ret void
+}
+
 attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
 attributes #1 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "target-cpu"="gfx803" "target-features"="+atomic-fadd-no-rtn-insts" "amdgpu-unsafe-fp-atomics"="true" }
 attributes #2 = { "amdgpu-unsafe-fp-atomics"="true" }

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
new file mode 100644
index 00000000000000..7723a724c60866
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
@@ -0,0 +1,3093 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; FIXME: globalisel crashes on v3
+; RUN:  llc -march=amdgcn -mcpu=gfx704 < %s  | FileCheck --check-prefixes=GFX7CHECK,GFX7SELDAG %s
+; xUN:  llc -global-isel=1 -march=amdgcn -mcpu=gfx704 < %s  | FileCheck --check-prefixes=GFX7CHECK,GFX7GLISEL %s
+; RUN:  llc -march=amdgcn -mcpu=gfx803 < %s  | FileCheck --check-prefixes=GFX8CHECK,GFX8SELDAG %s
+; xUN:  llc -global-isel=1 -march=amdgcn -mcpu=gfx803 < %s  | FileCheck --check-prefixes=GFX8CHECK,GFX8GLISEL %s
+; RUN:  llc -march=amdgcn -mcpu=gfx908 < %s  | FileCheck --check-prefixes=GFX9CHECK,GFX9SELDAG %s
+; xUN:  llc -global-isel=1 -march=amdgcn -mcpu=gfx908 < %s  | FileCheck --check-prefixes=GFX9CHECK,GFX9GLISEL %s
+; RUN:  llc -march=amdgcn -mcpu=gfx1031 < %s | FileCheck --check-prefixes=GFX10CHECK,GFX10SELDAG %s
+; xUN:  llc -global-isel=1 -march=amdgcn -mcpu=gfx1031 < %s | FileCheck --check-prefixes=GFX10CHECK,GFX10GLISEL %s
+; RUN:  llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11CHECK,GFX11SELDAG %s
+; xUN:  llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11CHECK,GFX11GLISEL %s
+
+define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) {
+; GFX7CHECK-LABEL: sgpr_isnan_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_load_dword s4, s[0:1], 0xb
+; GFX7CHECK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7CHECK-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7CHECK-NEXT:    s_mov_b32 s2, -1
+; GFX7CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7CHECK-NEXT:    s_and_b32 s4, s4, 0x7fff
+; GFX7CHECK-NEXT:    s_cmpk_gt_i32 s4, 0x7f80
+; GFX7CHECK-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
+; GFX7CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7CHECK-NEXT:    s_endpgm
+;
+; GFX8CHECK-LABEL: sgpr_isnan_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_load_dword s2, s[0:1], 0x2c
+; GFX8CHECK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v0, 0x7fff
+; GFX8CHECK-NEXT:    s_movk_i32 s3, 0x7f80
+; GFX8CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s3, v0
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8CHECK-NEXT:    flat_store_dword v[0:1], v2
+; GFX8CHECK-NEXT:    s_endpgm
+;
+; GFX9CHECK-LABEL: sgpr_isnan_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GFX9CHECK-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX9CHECK-NEXT:    s_movk_i32 s0, 0x7f80
+; GFX9CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9CHECK-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s0, v1
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9CHECK-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9CHECK-NEXT:    s_endpgm
+;
+; GFX10CHECK-LABEL: sgpr_isnan_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_clause 0x1
+; GFX10CHECK-NEXT:    s_load_dword s2, s[0:1], 0x2c
+; GFX10CHECK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_and_b32_e64 v0, 0x7fff, s2
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX10CHECK-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10CHECK-NEXT:    s_endpgm
+;
+; GFX11CHECK-LABEL: sgpr_isnan_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_clause 0x1
+; GFX11CHECK-NEXT:    s_load_b32 s2, s[0:1], 0x2c
+; GFX11CHECK-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_and_b32_e64 v0, 0x7fff, s2
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11CHECK-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11CHECK-NEXT:    s_nop 0
+; GFX11CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11CHECK-NEXT:    s_endpgm
+  %result = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 3)
+  %sext = sext i1 %result to i32
+  store i32 %sext, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define i1 @zeromask_bf16(bfloat %x) nounwind {
+; GFX7CHECK-LABEL: zeromask_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: zeromask_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: zeromask_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: zeromask_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: zeromask_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 0)
+  ret i1 %1
+}
+
+; FIXME: DAG and GlobalISel return 
diff erent values for i1 true
+define i1 @allflags_bf16(bfloat %x) nounwind {
+; GFX7CHECK-LABEL: allflags_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: allflags_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: allflags_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: allflags_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: allflags_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 1023) ; 0x3ff
+  ret i1 %1
+}
+
+define i1 @snan_bf16(bfloat %x) nounwind {
+; GFX7CHECK-LABEL: snan_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7fc0
+; GFX7CHECK-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e64 s[4:5], s4, v0
+; GFX7CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: snan_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7fc0
+; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e64 s[4:5], s4, v0
+; GFX8CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: snan_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fc0
+; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e64 s[4:5], s4, v0
+; GFX9CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: snan_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e64 s4, 0x7f80, v0
+; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: snan_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e64 s0, 0x7f80, v0
+; GFX11CHECK-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 1)  ; 0x001
+  ret i1 %1
+}
+
+define i1 @qnan_bf16(bfloat %x) nounwind {
+; GFX7CHECK-LABEL: qnan_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7fbf
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: qnan_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7fbf
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: qnan_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fbf
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: qnan_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: qnan_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 2)  ; 0x002
+  ret i1 %1
+}
+
+define i1 @posinf_bf16(bfloat %x) nounwind {
+; GFX7CHECK-LABEL: posinf_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX7CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: posinf_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: posinf_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_sdwa s[4:5], v0, s4 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: posinf_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7f80
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: posinf_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 512)  ; 0x200
+  ret i1 %1
+}
+
+define i1 @neginf_bf16(bfloat %x) nounwind {
+; GFX7CHECK-LABEL: neginf_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7CHECK-NEXT:    s_mov_b32 s4, 0xff80
+; GFX7CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: neginf_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0xff80
+; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: neginf_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0xff80
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_sdwa s[4:5], v0, s4 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: neginf_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0xffffff80
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: neginf_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0xff80, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 4)  ; 0x004
+  ret i1 %1
+}
+
+define i1 @posnormal_bf16(bfloat %x) nounwind {
+; GFX7CHECK-LABEL: posnormal_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    v_add_i32_e32 v0, vcc, 0xffffff80, v0
+; GFX7CHECK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7CHECK-NEXT:    s_movk_i32 s6, 0x7f00
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e64 s[4:5], -1, v1
+; GFX7CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s6, v0
+; GFX7CHECK-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: posnormal_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, -1, v0
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX8CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f00
+; GFX8CHECK-NEXT:    v_cmp_gt_u16_e64 s[4:5], s4, v0
+; GFX8CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: posnormal_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, -1, v0
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX9CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f00
+; GFX9CHECK-NEXT:    v_cmp_gt_u16_e64 s[4:5], s4, v0
+; GFX9CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: posnormal_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, -1, v0
+; GFX10CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
+; GFX10CHECK-NEXT:    v_cmp_gt_u16_e64 s4, 0x7f00, v1
+; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: posnormal_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, -1, v0
+; GFX11CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
+; GFX11CHECK-NEXT:    v_cmp_gt_u16_e64 s0, 0x7f00, v1
+; GFX11CHECK-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 256)  ; 0x100
+  ret i1 %1
+}
+
+define i1 @negnormal_bf16(bfloat %x) nounwind {
+; GFX7CHECK-LABEL: negnormal_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    v_add_i32_e32 v0, vcc, 0xffffff80, v0
+; GFX7CHECK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7CHECK-NEXT:    s_movk_i32 s6, 0x7f00
+; GFX7CHECK-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v1
+; GFX7CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s6, v0
+; GFX7CHECK-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: negnormal_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v0
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX8CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f00
+; GFX8CHECK-NEXT:    v_cmp_gt_u16_e64 s[4:5], s4, v0
+; GFX8CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: negnormal_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v0
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX9CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f00
+; GFX9CHECK-NEXT:    v_cmp_gt_u16_e64 s[4:5], s4, v0
+; GFX9CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: negnormal_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
+; GFX10CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
+; GFX10CHECK-NEXT:    v_cmp_gt_u16_e64 s4, 0x7f00, v1
+; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: negnormal_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
+; GFX11CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
+; GFX11CHECK-NEXT:    v_cmp_gt_u16_e64 s0, 0x7f00, v1
+; GFX11CHECK-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 8)  ; 0x008
+  ret i1 %1
+}
+
+define i1 @possubnormal_bf16(bfloat %x) nounwind {
+; GFX7CHECK-LABEL: possubnormal_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7CHECK-NEXT:    v_add_i32_e32 v0, vcc, -1, v0
+; GFX7CHECK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f
+; GFX7CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: possubnormal_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, -1
+; GFX8CHECK-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f
+; GFX8CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: possubnormal_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, -1
+; GFX9CHECK-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f
+; GFX9CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: possubnormal_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10CHECK-NEXT:    v_add_nc_u16 v0, v0, -1
+; GFX10CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: possubnormal_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_add_nc_u16 v0, v0, -1
+; GFX11CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 128)  ; 0x080
+  ret i1 %1
+}
+
+define i1 @negsubnormal_bf16(bfloat %x) nounwind {
+; GFX7CHECK-LABEL: negsubnormal_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    v_add_i32_e64 v0, s[4:5], -1, v0
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f
+; GFX7CHECK-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
+; GFX7CHECK-NEXT:    v_cmp_gt_u32_e64 s[4:5], s4, v0
+; GFX7CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: negsubnormal_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v0
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX8CHECK-NEXT:    v_add_u16_e32 v0, -1, v0
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f
+; GFX8CHECK-NEXT:    v_cmp_gt_u16_e64 s[4:5], s4, v0
+; GFX8CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: negsubnormal_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v0
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX9CHECK-NEXT:    v_add_u16_e32 v0, -1, v0
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f
+; GFX9CHECK-NEXT:    v_cmp_gt_u16_e64 s[4:5], s4, v0
+; GFX9CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: negsubnormal_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
+; GFX10CHECK-NEXT:    v_add_nc_u16 v1, v1, -1
+; GFX10CHECK-NEXT:    v_cmp_gt_u16_e64 s4, 0x7f, v1
+; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: negsubnormal_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
+; GFX11CHECK-NEXT:    v_add_nc_u16 v1, v1, -1
+; GFX11CHECK-NEXT:    v_cmp_gt_u16_e64 s0, 0x7f, v1
+; GFX11CHECK-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 16)  ; 0x010
+  ret i1 %1
+}
+
+define i1 @poszero_bf16(bfloat %x) nounwind {
+; GFX7CHECK-LABEL: poszero_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: poszero_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: poszero_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_sdwa s[4:5], v0, v1 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: poszero_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: poszero_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 64)  ; 0x040
+  ret i1 %1
+}
+
+define i1 @negzero_bf16(bfloat %x) nounwind {
+; GFX7CHECK-LABEL: negzero_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7CHECK-NEXT:    s_mov_b32 s4, 0x8000
+; GFX7CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: negzero_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x8000
+; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: negzero_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x8000
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_sdwa s[4:5], v0, s4 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: negzero_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0xffff8000
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: negzero_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 32)  ; 0x020
+  ret i1 %1
+}
+
+define i1 @posfinite_bf16(bfloat %x) nounwind {
+; GFX7CHECK-LABEL: posfinite_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX7CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: posfinite_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX8CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: posfinite_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX9CHECK-NEXT:    v_cmp_lt_u16_sdwa s[4:5], v0, s4 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: posfinite_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7f80
+; GFX10CHECK-NEXT:    v_cmp_lt_u16_sdwa s4, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: posfinite_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f80, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 448)  ; 0x1c0
+  ret i1 %1
+}
+
+define i1 @negfinite_bf16(bfloat %x) nounwind {
+; GFX7CHECK-LABEL: negfinite_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX7CHECK-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
+; GFX7CHECK-NEXT:    v_cmp_gt_i32_e64 s[4:5], s4, v0
+; GFX7CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: negfinite_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v0
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX8CHECK-NEXT:    v_cmp_gt_i16_e64 s[4:5], s4, v0
+; GFX8CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: negfinite_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v0
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX9CHECK-NEXT:    v_cmp_gt_i16_e64 s[4:5], s4, v0
+; GFX9CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: negfinite_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
+; GFX10CHECK-NEXT:    v_cmp_gt_i16_e64 s4, 0x7f80, v1
+; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: negfinite_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
+; GFX11CHECK-NEXT:    v_cmp_gt_i16_e64 s0, 0x7f80, v1
+; GFX11CHECK-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 56)  ; 0x038
+  ret i1 %1
+}
+
+define i1 @isnan_bf16(bfloat %x) nounwind {
+; GFX7CHECK-LABEL: isnan_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: isnan_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: isnan_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: isnan_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: isnan_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 3)  ; nan
+  ret i1 %1
+}
+
+define i1 @not_isnan_bf16(bfloat %x) {
+; GFX7CHECK-LABEL: not_isnan_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f81
+; GFX7CHECK-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: not_isnan_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f81
+; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: not_isnan_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f81
+; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: not_isnan_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: not_isnan_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %class = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 1020)  ; ~nan
+  ret i1 %class
+}
+
+define <2 x i1> @isnan_v2bf16(<2 x bfloat> %x) nounwind {
+; GFX7CHECK-LABEL: isnan_v2bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    v_bfe_u32 v1, v1, 16, 15
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v1
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: isnan_v2bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_bfe_u32 v1, v0, 16, 15
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v1
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: isnan_v2bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v0
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v1
+; GFX9CHECK-NEXT:    v_cmp_gt_i16_sdwa s[4:5], v1, s4 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: isnan_v2bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v0
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v2, 0x7f80
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v1
+; GFX10CHECK-NEXT:    v_cmp_gt_i16_sdwa s4, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: isnan_v2bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v1
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %1 = call <2 x i1> @llvm.is.fpclass.v2bf16(<2 x bfloat> %x, i32 3)  ; nan
+  ret <2 x i1> %1
+}
+
+define <3 x i1> @isnan_v3bf16(<3 x bfloat> %x) nounwind {
+; GFX7CHECK-LABEL: isnan_v3bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    v_bfe_u32 v1, v1, 16, 15
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_bfe_u32 v2, v2, 16, 15
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v1
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v2
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: isnan_v3bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX8CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v1
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v3
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: isnan_v3bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX9CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v1
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v3
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: isnan_v3bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_e32 v3, 0x7fff, v0
+; GFX10CHECK-NEXT:    v_and_b32_e32 v4, 0x7fff, v1
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v3
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v2
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v4
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: isnan_v3bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v3, 0x7fff, v1
+; GFX11CHECK-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v2
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v3
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %1 = call <3 x i1> @llvm.is.fpclass.v3bf16(<3 x bfloat> %x, i32 3)  ; nan
+  ret <3 x i1> %1
+}
+
+define <4 x i1> @isnan_v4bf16(<4 x bfloat> %x) nounwind {
+; GFX7CHECK-LABEL: isnan_v4bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    v_bfe_u32 v1, v1, 16, 15
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_bfe_u32 v2, v2, 16, 15
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v1
+; GFX7CHECK-NEXT:    v_bfe_u32 v3, v3, 16, 15
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v2
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v3
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: isnan_v4bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v1
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v4
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v3
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: isnan_v4bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    s_movk_i32 s5, 0x7f80
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v4, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v3, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s5, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s5, v1
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s5, v4
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s5, v3
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: isnan_v4bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_and_b32_e32 v3, 0x7fff, v0
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_e32 v4, 0x7fff, v1
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v3
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v4
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v1
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v5
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: isnan_v4bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; GFX11CHECK-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v4, 0x7fff, v2
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v1
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v3
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v4
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %1 = call <4 x i1> @llvm.is.fpclass.v4bf16(<4 x bfloat> %x, i32 3)  ; nan
+  ret <4 x i1> %1
+}
+
+define i1 @isnan_bf16_strictfp(bfloat %x) strictfp nounwind {
+; GFX7CHECK-LABEL: isnan_bf16_strictfp:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: isnan_bf16_strictfp:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: isnan_bf16_strictfp:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: isnan_bf16_strictfp:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: isnan_bf16_strictfp:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 3) strictfp ; nan
+  ret i1 %1
+}
+
+define i1 @isinf_bf16(bfloat %x) nounwind {
+; GFX7CHECK-LABEL: isinf_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX7CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: isinf_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: isinf_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: isinf_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: isinf_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 516)  ; 0x204 = "inf"
+  ret i1 %1
+}
+
+define i1 @isfinite_bf16(bfloat %x) nounwind {
+; GFX7CHECK-LABEL: isfinite_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX7CHECK-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: isfinite_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: isfinite_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: isfinite_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: isfinite_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 504)  ; 0x1f8 = "finite"
+  ret i1 %1
+}
+
+define i1 @issubnormal_or_zero_bf16(bfloat %x) {
+; GFX7CHECK-LABEL: issubnormal_or_zero_bf16:
+; GFX7CHECK:       ; %bb.0: ; %entry
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7CHECK-NEXT:    v_and_b32_e32 v0, 0x7f80, v0
+; GFX7CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: issubnormal_or_zero_bf16:
+; GFX8CHECK:       ; %bb.0: ; %entry
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7f80
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: issubnormal_or_zero_bf16:
+; GFX9CHECK:       ; %bb.0: ; %entry
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: issubnormal_or_zero_bf16:
+; GFX10CHECK:       ; %bb.0: ; %entry
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7f80
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: issubnormal_or_zero_bf16:
+; GFX11CHECK:       ; %bb.0: ; %entry
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7f80, v0
+; GFX11CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 240)  ; 0xf0 = "subnormal|zero"
+  ret i1 %class
+}
+
+define i1 @not_issubnormal_or_zero_bf16(bfloat %x) {
+; GFX7CHECK-LABEL: not_issubnormal_or_zero_bf16:
+; GFX7CHECK:       ; %bb.0: ; %entry
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7CHECK-NEXT:    v_and_b32_e32 v0, 0x7f80, v0
+; GFX7CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: not_issubnormal_or_zero_bf16:
+; GFX8CHECK:       ; %bb.0: ; %entry
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7f80
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: not_issubnormal_or_zero_bf16:
+; GFX9CHECK:       ; %bb.0: ; %entry
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: not_issubnormal_or_zero_bf16:
+; GFX10CHECK:       ; %bb.0: ; %entry
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7f80
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: not_issubnormal_or_zero_bf16:
+; GFX11CHECK:       ; %bb.0: ; %entry
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7f80, v0
+; GFX11CHECK-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+    %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 783)  ; ~0xf0 = "~(subnormal|zero)"
+  ret i1 %class
+}
+
+define i1 @isnormal_bf16(bfloat %x) {
+; GFX7CHECK-LABEL: isnormal_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    v_add_i32_e32 v0, vcc, 0xffffff80, v0
+; GFX7CHECK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f00
+; GFX7CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: isnormal_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f00
+; GFX8CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: isnormal_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f00
+; GFX9CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: isnormal_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_add_nc_u16 v0, v0, 0xff80
+; GFX10CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: isnormal_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_add_nc_u16 v0, v0, 0xff80
+; GFX11CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 264)  ; 0x108 = "normal"
+  ret i1 %class
+}
+
+define i1 @not_isnormal_bf16(bfloat %x) {
+; GFX7CHECK-LABEL: not_isnormal_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    v_add_i32_e32 v0, vcc, 0xffffff80, v0
+; GFX7CHECK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7eff
+; GFX7CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: not_isnormal_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7eff
+; GFX8CHECK-NEXT:    v_cmp_lt_u16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: not_isnormal_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7eff
+; GFX9CHECK-NEXT:    v_cmp_lt_u16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: not_isnormal_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_add_nc_u16 v0, v0, 0xff80
+; GFX10CHECK-NEXT:    v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: not_isnormal_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_add_nc_u16 v0, v0, 0xff80
+; GFX11CHECK-NEXT:    v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 759)  ; ~0x108 = "~normal"
+  ret i1 %class
+}
+
+define i1 @not_is_plus_normal_bf16(bfloat %x) {
+; GFX7CHECK-LABEL: not_is_plus_normal_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    v_add_i32_e32 v0, vcc, 0xffffff80, v0
+; GFX7CHECK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7CHECK-NEXT:    s_movk_i32 s6, 0x7eff
+; GFX7CHECK-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v1
+; GFX7CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v0
+; GFX7CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: not_is_plus_normal_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v0
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX8CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7eff
+; GFX8CHECK-NEXT:    v_cmp_lt_u16_e64 s[4:5], s4, v0
+; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: not_is_plus_normal_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v0
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX9CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7eff
+; GFX9CHECK-NEXT:    v_cmp_lt_u16_e64 s[4:5], s4, v0
+; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: not_is_plus_normal_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
+; GFX10CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
+; GFX10CHECK-NEXT:    v_cmp_lt_u16_e64 s4, 0x7eff, v1
+; GFX10CHECK-NEXT:    s_or_b32 s4, s4, vcc_lo
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: not_is_plus_normal_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
+; GFX11CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
+; GFX11CHECK-NEXT:    v_cmp_lt_u16_e64 s0, 0x7eff, v1
+; GFX11CHECK-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 767)  ; ~0x100 = ~"+normal"
+  ret i1 %class
+}
+
+define i1 @not_is_neg_normal_bf16(bfloat %x) {
+; GFX7CHECK-LABEL: not_is_neg_normal_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    v_add_i32_e32 v0, vcc, 0xffffff80, v0
+; GFX7CHECK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7CHECK-NEXT:    s_movk_i32 s6, 0x7eff
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e64 s[4:5], -1, v1
+; GFX7CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v0
+; GFX7CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: not_is_neg_normal_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, -1, v0
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX8CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7eff
+; GFX8CHECK-NEXT:    v_cmp_lt_u16_e64 s[4:5], s4, v0
+; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: not_is_neg_normal_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, -1, v0
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX9CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7eff
+; GFX9CHECK-NEXT:    v_cmp_lt_u16_e64 s[4:5], s4, v0
+; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: not_is_neg_normal_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, -1, v0
+; GFX10CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
+; GFX10CHECK-NEXT:    v_cmp_lt_u16_e64 s4, 0x7eff, v1
+; GFX10CHECK-NEXT:    s_or_b32 s4, s4, vcc_lo
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: not_is_neg_normal_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, -1, v0
+; GFX11CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
+; GFX11CHECK-NEXT:    v_cmp_lt_u16_e64 s0, 0x7eff, v1
+; GFX11CHECK-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 1015)  ; ~0x008 = ~"-normal"
+  ret i1 %class
+}
+
+define i1 @issubnormal_bf16(bfloat %x) {
+; GFX7CHECK-LABEL: issubnormal_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    v_add_i32_e32 v0, vcc, -1, v0
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f
+; GFX7CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: issubnormal_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_add_u16_e32 v0, -1, v0
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f
+; GFX8CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: issubnormal_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_add_u16_e32 v0, -1, v0
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f
+; GFX9CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: issubnormal_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_add_nc_u16 v0, v0, -1
+; GFX10CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: issubnormal_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_add_nc_u16 v0, v0, -1
+; GFX11CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 144)  ; 0x90 = "subnormal"
+  ret i1 %class
+}
+
+define i1 @not_issubnormal_bf16(bfloat %x) {
+; GFX7CHECK-LABEL: not_issubnormal_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    v_add_i32_e32 v0, vcc, -1, v0
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7e
+; GFX7CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: not_issubnormal_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_add_u16_e32 v0, -1, v0
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7e
+; GFX8CHECK-NEXT:    v_cmp_lt_u16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: not_issubnormal_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_add_u16_e32 v0, -1, v0
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7e
+; GFX9CHECK-NEXT:    v_cmp_lt_u16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: not_issubnormal_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_add_nc_u16 v0, v0, -1
+; GFX10CHECK-NEXT:    v_cmp_lt_u16_e32 vcc_lo, 0x7e, v0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: not_issubnormal_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_add_nc_u16 v0, v0, -1
+; GFX11CHECK-NEXT:    v_cmp_lt_u16_e32 vcc_lo, 0x7e, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 879)  ; ~0x90 = ~"subnormal"
+  ret i1 %class
+}
+
+define i1 @iszero_bf16(bfloat %x) {
+; GFX7CHECK-LABEL: iszero_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: iszero_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: iszero_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: iszero_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: iszero_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 96)  ; 0x60 = "zero"
+  ret i1 %class
+}
+
+define i1 @not_iszero_bf16(bfloat %x) {
+; GFX7CHECK-LABEL: not_iszero_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: not_iszero_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: not_iszero_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: not_iszero_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: not_iszero_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 927)  ; ~0x60 = ~"zero"
+  ret i1 %class
+}
+
+define i1 @ispositive_bf16(bfloat %x) {
+; GFX7CHECK-LABEL: ispositive_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f81
+; GFX7CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: ispositive_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f81
+; GFX8CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: ispositive_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f81
+; GFX9CHECK-NEXT:    v_cmp_lt_u16_sdwa s[4:5], v0, s4 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: ispositive_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7f81
+; GFX10CHECK-NEXT:    v_cmp_lt_u16_sdwa s4, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: ispositive_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 960)  ; fcPositive
+  ret i1 %class
+}
+
+define i1 @not_ispositive_bf16(bfloat %x) {
+; GFX7CHECK-LABEL: not_ispositive_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX7CHECK-NEXT:    v_ashrrev_i32_e32 v2, 16, v0
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    s_movk_i32 s6, 0x7f80
+; GFX7CHECK-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
+; GFX7CHECK-NEXT:    v_cmp_gt_i32_e64 s[4:5], s6, v0
+; GFX7CHECK-NEXT:    s_mov_b32 s7, 0xff80
+; GFX7CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX7CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, s7, v1
+; GFX7CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s6, v0
+; GFX7CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: not_ispositive_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX8CHECK-NEXT:    s_movk_i32 s6, 0x7f80
+; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v0
+; GFX8CHECK-NEXT:    v_cmp_gt_i16_e64 s[4:5], s6, v1
+; GFX8CHECK-NEXT:    s_movk_i32 s7, 0xff80
+; GFX8CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s7, v0
+; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s6, v1
+; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: not_ispositive_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX9CHECK-NEXT:    s_movk_i32 s6, 0x7f80
+; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v0
+; GFX9CHECK-NEXT:    v_cmp_gt_i16_e64 s[4:5], s6, v1
+; GFX9CHECK-NEXT:    s_movk_i32 s7, 0xff80
+; GFX9CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s7, v0
+; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s6, v1
+; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: not_ispositive_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s5, 0xff80, v0
+; GFX10CHECK-NEXT:    v_cmp_gt_i16_e64 s4, 0x7f80, v1
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e64 s6, 0x7f80, v1
+; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
+; GFX10CHECK-NEXT:    s_or_b32 s4, s4, s5
+; GFX10CHECK-NEXT:    s_or_b32 s4, s4, s6
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: not_ispositive_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
+; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s1, 0xff80, v0
+; GFX11CHECK-NEXT:    v_cmp_gt_i16_e64 s0, 0x7f80, v1
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e64 s2, 0x7f80, v1
+; GFX11CHECK-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11CHECK-NEXT:    s_or_b32 s0, s0, s1
+; GFX11CHECK-NEXT:    s_or_b32 s0, s0, s2
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 63)  ; ~fcPositive
+  ret i1 %class
+}
+
+define i1 @isnegative_bf16(bfloat %x) {
+; GFX7CHECK-LABEL: isnegative_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX7CHECK-NEXT:    v_ashrrev_i32_e32 v2, 16, v0
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX7CHECK-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
+; GFX7CHECK-NEXT:    v_cmp_gt_i32_e64 s[4:5], s4, v0
+; GFX7CHECK-NEXT:    s_mov_b32 s6, 0xff80
+; GFX7CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX7CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v1
+; GFX7CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: isnegative_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v0
+; GFX8CHECK-NEXT:    v_cmp_gt_i16_e64 s[4:5], s4, v1
+; GFX8CHECK-NEXT:    s_movk_i32 s6, 0xff80
+; GFX8CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s6, v0
+; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: isnegative_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v0
+; GFX9CHECK-NEXT:    v_cmp_gt_i16_e64 s[4:5], s4, v1
+; GFX9CHECK-NEXT:    s_movk_i32 s6, 0xff80
+; GFX9CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s6, v0
+; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: isnegative_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s5, 0xff80, v0
+; GFX10CHECK-NEXT:    v_cmp_gt_i16_e64 s4, 0x7f80, v1
+; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
+; GFX10CHECK-NEXT:    s_or_b32 s4, s4, s5
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: isnegative_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
+; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s1, 0xff80, v0
+; GFX11CHECK-NEXT:    v_cmp_gt_i16_e64 s0, 0x7f80, v1
+; GFX11CHECK-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11CHECK-NEXT:    s_or_b32 s0, s0, s1
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 60)  ; fcNegative
+  ret i1 %class
+}
+
+define i1 @not_isnegative_bf16(bfloat %x) {
+; GFX7CHECK-LABEL: not_isnegative_bf16:
+; GFX7CHECK:       ; %bb.0:
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f81
+; GFX7CHECK-NEXT:    v_cmp_gt_u32_e64 s[4:5], s4, v1
+; GFX7CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: not_isnegative_bf16:
+; GFX8CHECK:       ; %bb.0:
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f81
+; GFX8CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e64 s[4:5], s4, v0
+; GFX8CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: not_isnegative_bf16:
+; GFX9CHECK:       ; %bb.0:
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f81
+; GFX9CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e64 s[4:5], s4, v0
+; GFX9CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: not_isnegative_bf16:
+; GFX10CHECK:       ; %bb.0:
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX10CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e64 s4, 0x7f80, v1
+; GFX10CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: not_isnegative_bf16:
+; GFX11CHECK:       ; %bb.0:
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e64 s0, 0x7f80, v1
+; GFX11CHECK-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 963)  ; ~fcNegative
+  ret i1 %class
+}
+
+define i1 @iszero_or_nan_bf16(bfloat %x) {
+; GFX7CHECK-LABEL: iszero_or_nan_bf16:
+; GFX7CHECK:       ; %bb.0: ; %entry
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX7CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: iszero_or_nan_bf16:
+; GFX8CHECK:       ; %bb.0: ; %entry
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: iszero_or_nan_bf16:
+; GFX9CHECK:       ; %bb.0: ; %entry
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: iszero_or_nan_bf16:
+; GFX10CHECK:       ; %bb.0: ; %entry
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0
+; GFX10CHECK-NEXT:    s_or_b32 s4, s4, vcc_lo
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: iszero_or_nan_bf16:
+; GFX11CHECK:       ; %bb.0: ; %entry
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
+; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0
+; GFX11CHECK-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 99)  ; 0x60|0x3 = "zero|nan"
+  ret i1 %0
+}
+
+define i1 @iszero_or_nan_f_daz(bfloat %x) #0 {
+; GFX7CHECK-LABEL: iszero_or_nan_f_daz:
+; GFX7CHECK:       ; %bb.0: ; %entry
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX7CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: iszero_or_nan_f_daz:
+; GFX8CHECK:       ; %bb.0: ; %entry
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: iszero_or_nan_f_daz:
+; GFX9CHECK:       ; %bb.0: ; %entry
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: iszero_or_nan_f_daz:
+; GFX10CHECK:       ; %bb.0: ; %entry
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0
+; GFX10CHECK-NEXT:    s_or_b32 s4, s4, vcc_lo
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: iszero_or_nan_f_daz:
+; GFX11CHECK:       ; %bb.0: ; %entry
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
+; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0
+; GFX11CHECK-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 99)  ; 0x60|0x3 = "zero|nan"
+  ret i1 %0
+}
+
+define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
+; GFX7CHECK-LABEL: iszero_or_nan_f_maybe_daz:
+; GFX7CHECK:       ; %bb.0: ; %entry
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX7CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: iszero_or_nan_f_maybe_daz:
+; GFX8CHECK:       ; %bb.0: ; %entry
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: iszero_or_nan_f_maybe_daz:
+; GFX9CHECK:       ; %bb.0: ; %entry
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: iszero_or_nan_f_maybe_daz:
+; GFX10CHECK:       ; %bb.0: ; %entry
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0
+; GFX10CHECK-NEXT:    s_or_b32 s4, s4, vcc_lo
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: iszero_or_nan_f_maybe_daz:
+; GFX11CHECK:       ; %bb.0: ; %entry
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
+; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0
+; GFX11CHECK-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 99)  ; 0x60|0x3 = "zero|nan"
+  ret i1 %0
+}
+
+define i1 @not_iszero_or_nan_bf16(bfloat %x) {
+; GFX7CHECK-LABEL: not_iszero_or_nan_bf16:
+; GFX7CHECK:       ; %bb.0: ; %entry
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f81
+; GFX7CHECK-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GFX7CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: not_iszero_or_nan_bf16:
+; GFX8CHECK:       ; %bb.0: ; %entry
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f81
+; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v0
+; GFX8CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: not_iszero_or_nan_bf16:
+; GFX9CHECK:       ; %bb.0: ; %entry
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f81
+; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v0
+; GFX9CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: not_iszero_or_nan_bf16:
+; GFX10CHECK:       ; %bb.0: ; %entry
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
+; GFX10CHECK-NEXT:    v_cmp_ne_u16_e64 s4, 0, v0
+; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: not_iszero_or_nan_bf16:
+; GFX11CHECK:       ; %bb.0: ; %entry
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
+; GFX11CHECK-NEXT:    v_cmp_ne_u16_e64 s0, 0, v0
+; GFX11CHECK-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 924)  ; ~0x60 = "~(zero|nan)"
+  ret i1 %0
+}
+
+define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 {
+; GFX7CHECK-LABEL: not_iszero_or_nan_f_daz:
+; GFX7CHECK:       ; %bb.0: ; %entry
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f81
+; GFX7CHECK-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GFX7CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: not_iszero_or_nan_f_daz:
+; GFX8CHECK:       ; %bb.0: ; %entry
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f81
+; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v0
+; GFX8CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: not_iszero_or_nan_f_daz:
+; GFX9CHECK:       ; %bb.0: ; %entry
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f81
+; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v0
+; GFX9CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: not_iszero_or_nan_f_daz:
+; GFX10CHECK:       ; %bb.0: ; %entry
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
+; GFX10CHECK-NEXT:    v_cmp_ne_u16_e64 s4, 0, v0
+; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: not_iszero_or_nan_f_daz:
+; GFX11CHECK:       ; %bb.0: ; %entry
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
+; GFX11CHECK-NEXT:    v_cmp_ne_u16_e64 s0, 0, v0
+; GFX11CHECK-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 924)  ; ~(0x60|0x3) = "~(zero|nan)"
+  ret i1 %0
+}
+
+define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
+; GFX7CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
+; GFX7CHECK:       ; %bb.0: ; %entry
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f81
+; GFX7CHECK-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GFX7CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
+; GFX8CHECK:       ; %bb.0: ; %entry
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f81
+; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v0
+; GFX8CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
+; GFX9CHECK:       ; %bb.0: ; %entry
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f81
+; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v0
+; GFX9CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
+; GFX10CHECK:       ; %bb.0: ; %entry
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
+; GFX10CHECK-NEXT:    v_cmp_ne_u16_e64 s4, 0, v0
+; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
+; GFX11CHECK:       ; %bb.0: ; %entry
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
+; GFX11CHECK-NEXT:    v_cmp_ne_u16_e64 s0, 0, v0
+; GFX11CHECK-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 924)  ; ~(0x60|0x3) = "~(zero|nan)"
+  ret i1 %0
+}
+
+define i1 @iszero_or_qnan_bf16(bfloat %x) {
+; GFX7CHECK-LABEL: iszero_or_qnan_bf16:
+; GFX7CHECK:       ; %bb.0: ; %entry
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7fbf
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX7CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: iszero_or_qnan_bf16:
+; GFX8CHECK:       ; %bb.0: ; %entry
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7fbf
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: iszero_or_qnan_bf16:
+; GFX9CHECK:       ; %bb.0: ; %entry
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fbf
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
+; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: iszero_or_qnan_bf16:
+; GFX10CHECK:       ; %bb.0: ; %entry
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0
+; GFX10CHECK-NEXT:    s_or_b32 s4, s4, vcc_lo
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: iszero_or_qnan_bf16:
+; GFX11CHECK:       ; %bb.0: ; %entry
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0
+; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0
+; GFX11CHECK-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 98)  ; 0x60|0x2 = "zero|qnan"
+  ret i1 %0
+}
+
+define i1 @iszero_or_snan_bf16(bfloat %x) {
+; GFX7CHECK-LABEL: iszero_or_snan_bf16:
+; GFX7CHECK:       ; %bb.0: ; %entry
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7fc0
+; GFX7CHECK-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e64 s[4:5], s4, v0
+; GFX7CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX7CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: iszero_or_snan_bf16:
+; GFX8CHECK:       ; %bb.0: ; %entry
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7fc0
+; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e64 s[4:5], s4, v0
+; GFX8CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX8CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: iszero_or_snan_bf16:
+; GFX9CHECK:       ; %bb.0: ; %entry
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fc0
+; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e64 s[4:5], s4, v0
+; GFX9CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX9CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: iszero_or_snan_bf16:
+; GFX10CHECK:       ; %bb.0: ; %entry
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e64 s4, 0x7f80, v0
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s5, 0, v0
+; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
+; GFX10CHECK-NEXT:    s_or_b32 s4, s5, s4
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: iszero_or_snan_bf16:
+; GFX11CHECK:       ; %bb.0: ; %entry
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e64 s0, 0x7f80, v0
+; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s1, 0, v0
+; GFX11CHECK-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11CHECK-NEXT:    s_or_b32 s0, s1, s0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 97)  ; 0x60|0x1 = "zero|snan"
+  ret i1 %0
+}
+
+define i1 @not_iszero_or_qnan_bf16(bfloat %x) {
+; GFX7CHECK-LABEL: not_iszero_or_qnan_bf16:
+; GFX7CHECK:       ; %bb.0: ; %entry
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7fc0
+; GFX7CHECK-NEXT:    s_movk_i32 s8, 0x7f80
+; GFX7CHECK-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e64 s[4:5], s8, v0
+; GFX7CHECK-NEXT:    s_and_b64 s[6:7], s[4:5], vcc
+; GFX7CHECK-NEXT:    v_add_i32_e64 v1, s[4:5], -1, v0
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f
+; GFX7CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v0
+; GFX7CHECK-NEXT:    v_cmp_gt_u32_e64 s[4:5], s4, v1
+; GFX7CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX7CHECK-NEXT:    v_add_i32_e32 v0, vcc, 0xffffff80, v0
+; GFX7CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX7CHECK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7CHECK-NEXT:    s_movk_i32 s6, 0x7f00
+; GFX7CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s6, v0
+; GFX7CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: not_iszero_or_qnan_bf16:
+; GFX8CHECK:       ; %bb.0: ; %entry
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7fc0
+; GFX8CHECK-NEXT:    s_movk_i32 s8, 0x7f80
+; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e64 s[4:5], s8, v0
+; GFX8CHECK-NEXT:    s_and_b64 s[6:7], s[4:5], vcc
+; GFX8CHECK-NEXT:    v_add_u16_e32 v1, -1, v0
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f
+; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s8, v0
+; GFX8CHECK-NEXT:    v_cmp_gt_u16_e64 s[4:5], s4, v1
+; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX8CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
+; GFX8CHECK-NEXT:    s_movk_i32 s6, 0x7f00
+; GFX8CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s6, v0
+; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: not_iszero_or_qnan_bf16:
+; GFX9CHECK:       ; %bb.0: ; %entry
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fc0
+; GFX9CHECK-NEXT:    s_movk_i32 s8, 0x7f80
+; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e64 s[4:5], s8, v0
+; GFX9CHECK-NEXT:    s_and_b64 s[6:7], s[4:5], vcc
+; GFX9CHECK-NEXT:    v_add_u16_e32 v1, -1, v0
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s8, v0
+; GFX9CHECK-NEXT:    v_cmp_gt_u16_e64 s[4:5], s4, v1
+; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX9CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
+; GFX9CHECK-NEXT:    s_movk_i32 s6, 0x7f00
+; GFX9CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s6, v0
+; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: not_iszero_or_qnan_bf16:
+; GFX10CHECK:       ; %bb.0: ; %entry
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_add_nc_u16 v1, v0, -1
+; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e64 s4, 0x7f80, v0
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s5, 0x7f80, v0
+; GFX10CHECK-NEXT:    v_add_nc_u16 v0, v0, 0xff80
+; GFX10CHECK-NEXT:    v_cmp_gt_u16_e64 s6, 0x7f, v1
+; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0
+; GFX10CHECK-NEXT:    s_or_b32 s5, s6, s5
+; GFX10CHECK-NEXT:    s_or_b32 s4, s5, s4
+; GFX10CHECK-NEXT:    s_or_b32 s4, s4, vcc_lo
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: not_iszero_or_qnan_bf16:
+; GFX11CHECK:       ; %bb.0: ; %entry
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_add_nc_u16 v1, v0, -1
+; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e64 s0, 0x7f80, v0
+; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s1, 0x7f80, v0
+; GFX11CHECK-NEXT:    v_add_nc_u16 v0, v0, 0xff80
+; GFX11CHECK-NEXT:    v_cmp_gt_u16_e64 s2, 0x7f, v1
+; GFX11CHECK-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0
+; GFX11CHECK-NEXT:    s_or_b32 s1, s2, s1
+; GFX11CHECK-NEXT:    s_or_b32 s0, s1, s0
+; GFX11CHECK-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 925)  ; ~(0x60|0x2) = "~(zero|qnan)"
+  ret i1 %0
+}
+
+define i1 @not_iszero_or_snan_bf16(bfloat %x) {
+; GFX7CHECK-LABEL: not_iszero_or_snan_bf16:
+; GFX7CHECK:       ; %bb.0: ; %entry
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX7CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_add_i32_e64 v1, s[4:5], -1, v0
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f
+; GFX7CHECK-NEXT:    v_cmp_gt_u32_e64 s[4:5], s4, v1
+; GFX7CHECK-NEXT:    s_movk_i32 s6, 0x7fbf
+; GFX7CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s6, v0
+; GFX7CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX7CHECK-NEXT:    v_add_i32_e32 v0, vcc, 0xffffff80, v0
+; GFX7CHECK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7CHECK-NEXT:    s_movk_i32 s6, 0x7f00
+; GFX7CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s6, v0
+; GFX7CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: not_iszero_or_snan_bf16:
+; GFX8CHECK:       ; %bb.0: ; %entry
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_add_u16_e32 v1, -1, v0
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f
+; GFX8CHECK-NEXT:    v_cmp_gt_u16_e64 s[4:5], s4, v1
+; GFX8CHECK-NEXT:    s_movk_i32 s6, 0x7fbf
+; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s6, v0
+; GFX8CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
+; GFX8CHECK-NEXT:    s_movk_i32 s6, 0x7f00
+; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX8CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s6, v0
+; GFX8CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: not_iszero_or_snan_bf16:
+; GFX9CHECK:       ; %bb.0: ; %entry
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_add_u16_e32 v1, -1, v0
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f
+; GFX9CHECK-NEXT:    v_cmp_gt_u16_e64 s[4:5], s4, v1
+; GFX9CHECK-NEXT:    s_movk_i32 s6, 0x7fbf
+; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s6, v0
+; GFX9CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
+; GFX9CHECK-NEXT:    s_movk_i32 s6, 0x7f00
+; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX9CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s6, v0
+; GFX9CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: not_iszero_or_snan_bf16:
+; GFX10CHECK:       ; %bb.0: ; %entry
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_add_nc_u16 v1, v0, -1
+; GFX10CHECK-NEXT:    v_add_nc_u16 v2, v0, 0xff80
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e64 s5, 0x7fbf, v0
+; GFX10CHECK-NEXT:    v_cmp_gt_u16_e64 s4, 0x7f, v1
+; GFX10CHECK-NEXT:    v_cmp_gt_u16_e64 s6, 0x7f00, v2
+; GFX10CHECK-NEXT:    s_or_b32 s4, s4, vcc_lo
+; GFX10CHECK-NEXT:    s_or_b32 s4, s4, s5
+; GFX10CHECK-NEXT:    s_or_b32 s4, s4, s6
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: not_iszero_or_snan_bf16:
+; GFX11CHECK:       ; %bb.0: ; %entry
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_add_nc_u16 v1, v0, -1
+; GFX11CHECK-NEXT:    v_add_nc_u16 v2, v0, 0xff80
+; GFX11CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e64 s1, 0x7fbf, v0
+; GFX11CHECK-NEXT:    v_cmp_gt_u16_e64 s0, 0x7f, v1
+; GFX11CHECK-NEXT:    v_cmp_gt_u16_e64 s2, 0x7f00, v2
+; GFX11CHECK-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GFX11CHECK-NEXT:    s_or_b32 s0, s0, s1
+; GFX11CHECK-NEXT:    s_or_b32 s0, s0, s2
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 926)  ; ~(0x60|0x1) = "~(zero|snan)"
+  ret i1 %0
+}
+
+define i1 @isinf_or_nan_bf16(bfloat %x) {
+; GFX7CHECK-LABEL: isinf_or_nan_bf16:
+; GFX7CHECK:       ; %bb.0: ; %entry
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f7f
+; GFX7CHECK-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: isinf_or_nan_bf16:
+; GFX8CHECK:       ; %bb.0: ; %entry
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f7f
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: isinf_or_nan_bf16:
+; GFX9CHECK:       ; %bb.0: ; %entry
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f7f
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: isinf_or_nan_bf16:
+; GFX10CHECK:       ; %bb.0: ; %entry
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f7f, v0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: isinf_or_nan_bf16:
+; GFX11CHECK:       ; %bb.0: ; %entry
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f7f, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 519)  ; 0x204|0x3 = "inf|nan"
+  ret i1 %0
+}
+
+define i1 @not_isinf_or_nan_bf16(bfloat %x) {
+; GFX7CHECK-LABEL: not_isinf_or_nan_bf16:
+; GFX7CHECK:       ; %bb.0: ; %entry
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX7CHECK-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: not_isinf_or_nan_bf16:
+; GFX8CHECK:       ; %bb.0: ; %entry
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: not_isinf_or_nan_bf16:
+; GFX9CHECK:       ; %bb.0: ; %entry
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: not_isinf_or_nan_bf16:
+; GFX10CHECK:       ; %bb.0: ; %entry
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: not_isinf_or_nan_bf16:
+; GFX11CHECK:       ; %bb.0: ; %entry
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 504)  ; ~(0x204|0x3) = "~(inf|nan)"
+  ret i1 %0
+}
+
+define i1 @isfinite_or_nan_f(bfloat %x) {
+; GFX7CHECK-LABEL: isfinite_or_nan_f:
+; GFX7CHECK:       ; %bb.0: ; %entry
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX7CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: isfinite_or_nan_f:
+; GFX8CHECK:       ; %bb.0: ; %entry
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX8CHECK-NEXT:    v_cmp_ne_u16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: isfinite_or_nan_f:
+; GFX9CHECK:       ; %bb.0: ; %entry
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX9CHECK-NEXT:    v_cmp_ne_u16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: isfinite_or_nan_f:
+; GFX10CHECK:       ; %bb.0: ; %entry
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0x7f80, v0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: isfinite_or_nan_f:
+; GFX11CHECK:       ; %bb.0: ; %entry
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0x7f80, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 507)  ; 0x1f8|0x3 = "finite|nan"
+  ret i1 %0
+}
+
+define i1 @not_isfinite_or_nan_f(bfloat %x) {
+; GFX7CHECK-LABEL: not_isfinite_or_nan_f:
+; GFX7CHECK:       ; %bb.0: ; %entry
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT:    v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX7CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
+; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8CHECK-LABEL: not_isfinite_or_nan_f:
+; GFX8CHECK:       ; %bb.0: ; %entry
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9CHECK-LABEL: not_isfinite_or_nan_f:
+; GFX9CHECK:       ; %bb.0: ; %entry
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10CHECK-LABEL: not_isfinite_or_nan_f:
+; GFX10CHECK:       ; %bb.0: ; %entry
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11CHECK-LABEL: not_isfinite_or_nan_f:
+; GFX11CHECK:       ; %bb.0: ; %entry
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 516)  ; ~(0x1f8|0x3) = "~(finite|nan)"
+  ret i1 %0
+}
+
+declare i1 @llvm.is.fpclass.bf16(bfloat, i32)
+declare <2 x i1> @llvm.is.fpclass.v2bf16(<2 x bfloat>, i32)
+declare <3 x i1> @llvm.is.fpclass.v3bf16(<3 x bfloat>, i32)
+declare <4 x i1> @llvm.is.fpclass.v4bf16(<4 x bfloat>, i32)
+
+; Assume DAZ
+attributes #0 = { "denormal-fp-math"="ieee,preserve-sign" }
+
+; Maybe daz
+attributes #1 = { "denormal-fp-math"="ieee,dynamic" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10SELDAG: {{.*}}
+; GFX11SELDAG: {{.*}}
+; GFX7SELDAG: {{.*}}
+; GFX8SELDAG: {{.*}}
+; GFX9SELDAG: {{.*}}
+; GFX7GLISEL: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
index a6f8b5a862b723..b90c92bf9be52a 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
@@ -1403,3 +1403,265 @@ define void @lds_atomic_fsub_noret_f64(ptr addrspace(3) %ptr, double %val) nounw
   %result = atomicrmw fsub ptr addrspace(3) %ptr, double %val seq_cst
   ret void
 }
+
+define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
+; VI-LABEL: lds_atomic_fadd_ret_bf16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_and_b32_e32 v1, -4, v0
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    ds_read_b32 v3, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT:    s_mov_b32 s4, 0xffff
+; VI-NEXT:    v_and_b32_e32 v0, 24, v2
+; VI-NEXT:    v_lshlrev_b32_e64 v2, v2, s4
+; VI-NEXT:    v_not_b32_e32 v2, v2
+; VI-NEXT:    s_mov_b64 s[4:5], 0
+; VI-NEXT:  .LBB10_1: ; %atomicrmw.start
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v4, v3
+; VI-NEXT:    v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_add_f32_e32 v3, 4.0, v3
+; VI-NEXT:    v_and_b32_e32 v5, v4, v2
+; VI-NEXT:    v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_or_b32_e32 v3, v5, v3
+; VI-NEXT:    ds_cmpst_rtn_b32 v3, v1, v4, v3
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT:    s_cbranch_execnz .LBB10_1
+; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
+; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; VI-NEXT:    v_lshrrev_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lds_atomic_fadd_ret_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, -4, v0
+; GFX9-NEXT:    ds_read_b32 v2, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    v_and_b32_e32 v0, 24, v3
+; GFX9-NEXT:    v_lshlrev_b32_e64 v3, v3, s4
+; GFX9-NEXT:    v_not_b32_e32 v3, v3
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v4, v2
+; GFX9-NEXT:    v_lshrrev_b32_sdwa v2, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_add_f32_e32 v2, 4.0, v2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_and_or_b32 v2, v4, v3, v2
+; GFX9-NEXT:    ds_cmpst_rtn_b32 v2, v1, v4, v2
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    v_lshrrev_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: lds_atomic_fadd_ret_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, -4, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    ds_read_b32 v3, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 24, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_not_b32_e32 v2, v2
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v4, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v0, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_add_f32_e32 v3, 4.0, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v5, v4, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v0, v3
+; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v0, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: lds_atomic_fadd_ret_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, -4, v0
+; GFX8-NEXT:    s_mov_b32 m0, -1
+; GFX8-NEXT:    ds_read_b32 v3, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 24, v2
+; GFX8-NEXT:    v_lshl_b32_e32 v2, 0xffff, v2
+; GFX8-NEXT:    v_not_b32_e32 v2, v2
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v4, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, v0, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_add_f32_e32 v3, 4.0, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_and_b32_e32 v5, v4, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, v0, v3
+; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT:    ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v0, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+  %result = atomicrmw fadd ptr addrspace(3) %ptr, bfloat 4.0 seq_cst
+  ret bfloat %result
+}
+
+define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
+; VI-LABEL: lds_atomic_fadd_noret_bf16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_and_b32_e32 v1, -4, v0
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    ds_read_b32 v3, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT:    s_mov_b32 s4, 0xffff
+; VI-NEXT:    v_and_b32_e32 v0, 24, v2
+; VI-NEXT:    v_lshlrev_b32_e64 v2, v2, s4
+; VI-NEXT:    v_not_b32_e32 v2, v2
+; VI-NEXT:    s_mov_b64 s[4:5], 0
+; VI-NEXT:  .LBB11_1: ; %atomicrmw.start
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; VI-NEXT:    v_and_b32_e32 v5, v3, v2
+; VI-NEXT:    v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_or_b32_e32 v4, v5, v4
+; VI-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    v_mov_b32_e32 v3, v4
+; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT:    s_cbranch_execnz .LBB11_1
+; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
+; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lds_atomic_fadd_noret_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, -4, v0
+; GFX9-NEXT:    ds_read_b32 v3, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    v_and_b32_e32 v0, 24, v2
+; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v2, s4
+; GFX9-NEXT:    v_not_b32_e32 v2, v2
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:  .LBB11_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_and_or_b32 v4, v3, v2, v4
+; GFX9-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v3, v4
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: lds_atomic_fadd_noret_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, -4, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    ds_read_b32 v3, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 24, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_not_b32_e32 v2, v2
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:  .LBB11_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v0, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_and_b32_e32 v5, v3, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, v0, v4
+; GFX7-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: lds_atomic_fadd_noret_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v1, -4, v0
+; GFX8-NEXT:    s_mov_b32 m0, -1
+; GFX8-NEXT:    ds_read_b32 v3, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 24, v2
+; GFX8-NEXT:    v_lshl_b32_e32 v2, 0xffff, v2
+; GFX8-NEXT:    v_not_b32_e32 v2, v2
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB11_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, v0, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_and_b32_e32 v5, v3, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, v0, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v3, v4
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+  %result = atomicrmw fadd ptr addrspace(3) %ptr, bfloat 4.0 seq_cst
+  ret void
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
index 73457654307019..81ad1604756835 100644
--- a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
@@ -3,8 +3,9 @@
 ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
 
 define i32 @load_atomic_private_seq_cst_i32(ptr addrspace(5) %ptr) {
-; IR-LABEL: @load_atomic_private_seq_cst_i32(
-; IR-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
+; IR-LABEL: define i32 @load_atomic_private_seq_cst_i32(
+; IR-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
+; IR-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    ret i32 [[LOAD]]
 ;
 ; GCN-LABEL: load_atomic_private_seq_cst_i32:
@@ -18,8 +19,9 @@ define i32 @load_atomic_private_seq_cst_i32(ptr addrspace(5) %ptr) {
 }
 
 define i64 @load_atomic_private_seq_cst_i64(ptr addrspace(5) %ptr) {
-; IR-LABEL: @load_atomic_private_seq_cst_i64(
-; IR-NEXT:    [[LOAD:%.*]] = load i64, ptr addrspace(5) [[PTR:%.*]], align 8
+; IR-LABEL: define i64 @load_atomic_private_seq_cst_i64(
+; IR-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] {
+; IR-NEXT:    [[LOAD:%.*]] = load i64, ptr addrspace(5) [[PTR]], align 8
 ; IR-NEXT:    ret i64 [[LOAD]]
 ;
 ; GCN-LABEL: load_atomic_private_seq_cst_i64:
@@ -35,8 +37,9 @@ define i64 @load_atomic_private_seq_cst_i64(ptr addrspace(5) %ptr) {
 }
 
 define void @atomic_store_seq_cst_i32(ptr addrspace(5) %ptr, i32 %val) {
-; IR-LABEL: @atomic_store_seq_cst_i32(
-; IR-NEXT:    store i32 [[VAL:%.*]], ptr addrspace(5) [[PTR:%.*]], align 4
+; IR-LABEL: define void @atomic_store_seq_cst_i32(
+; IR-SAME: ptr addrspace(5) [[PTR:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] {
+; IR-NEXT:    store i32 [[VAL]], ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    ret void
 ;
 ; GCN-LABEL: atomic_store_seq_cst_i32:
@@ -50,8 +53,9 @@ define void @atomic_store_seq_cst_i32(ptr addrspace(5) %ptr, i32 %val) {
 }
 
 define void @atomic_store_seq_cst_i64(ptr addrspace(5) %ptr, i64 %val) {
-; IR-LABEL: @atomic_store_seq_cst_i64(
-; IR-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[PTR:%.*]], align 8
+; IR-LABEL: define void @atomic_store_seq_cst_i64(
+; IR-SAME: ptr addrspace(5) [[PTR:%.*]], i64 [[VAL:%.*]]) #[[ATTR0]] {
+; IR-NEXT:    store i64 [[VAL]], ptr addrspace(5) [[PTR]], align 8
 ; IR-NEXT:    ret void
 ;
 ; GCN-LABEL: atomic_store_seq_cst_i64:
@@ -67,8 +71,9 @@ define void @atomic_store_seq_cst_i64(ptr addrspace(5) %ptr, i64 %val) {
 }
 
 define i32 @load_atomic_private_seq_cst_syncscope_i32(ptr addrspace(5) %ptr) {
-; IR-LABEL: @load_atomic_private_seq_cst_syncscope_i32(
-; IR-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
+; IR-LABEL: define i32 @load_atomic_private_seq_cst_syncscope_i32(
+; IR-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] {
+; IR-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    ret i32 [[LOAD]]
 ;
 ; GCN-LABEL: load_atomic_private_seq_cst_syncscope_i32:
@@ -82,8 +87,9 @@ define i32 @load_atomic_private_seq_cst_syncscope_i32(ptr addrspace(5) %ptr) {
 }
 
 define void @atomic_store_seq_cst_syncscope_i32(ptr addrspace(5) %ptr, i32 %val) {
-; IR-LABEL: @atomic_store_seq_cst_syncscope_i32(
-; IR-NEXT:    store i32 [[VAL:%.*]], ptr addrspace(5) [[PTR:%.*]], align 4
+; IR-LABEL: define void @atomic_store_seq_cst_syncscope_i32(
+; IR-SAME: ptr addrspace(5) [[PTR:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] {
+; IR-NEXT:    store i32 [[VAL]], ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    ret void
 ;
 ; GCN-LABEL: atomic_store_seq_cst_syncscope_i32:
@@ -97,8 +103,9 @@ define void @atomic_store_seq_cst_syncscope_i32(ptr addrspace(5) %ptr, i32 %val)
 }
 
 define i32 @cmpxchg_private_i32(ptr addrspace(5) %ptr) {
-; IR-LABEL: @cmpxchg_private_i32(
-; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
+; IR-LABEL: define i32 @cmpxchg_private_i32(
+; IR-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] {
+; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
 ; IR-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i32 1, i32 [[TMP1]]
 ; IR-NEXT:    store i32 [[TMP3]], ptr addrspace(5) [[PTR]], align 4
@@ -133,8 +140,9 @@ define i32 @cmpxchg_private_i32(ptr addrspace(5) %ptr) {
 }
 
 define i64 @cmpxchg_private_i64(ptr addrspace(5) %ptr) {
-; IR-LABEL: @cmpxchg_private_i64(
-; IR-NEXT:    [[TMP1:%.*]] = load i64, ptr addrspace(5) [[PTR:%.*]], align 8
+; IR-LABEL: define i64 @cmpxchg_private_i64(
+; IR-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] {
+; IR-NEXT:    [[TMP1:%.*]] = load i64, ptr addrspace(5) [[PTR]], align 8
 ; IR-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 0
 ; IR-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i64 1, i64 [[TMP1]]
 ; IR-NEXT:    store i64 [[TMP3]], ptr addrspace(5) [[PTR]], align 8
@@ -174,8 +182,9 @@ define i64 @cmpxchg_private_i64(ptr addrspace(5) %ptr) {
 
 
 define i32 @atomicrmw_xchg_private_i32(ptr addrspace(5) %ptr) {
-; IR-LABEL: @atomicrmw_xchg_private_i32(
-; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
+; IR-LABEL: define i32 @atomicrmw_xchg_private_i32(
+; IR-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] {
+; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    store i32 4, ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    ret i32 [[TMP1]]
 ;
@@ -194,8 +203,9 @@ define i32 @atomicrmw_xchg_private_i32(ptr addrspace(5) %ptr) {
 }
 
 define i32 @atomicrmw_add_private_i32(ptr addrspace(5) %ptr) {
-; IR-LABEL: @atomicrmw_add_private_i32(
-; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
+; IR-LABEL: define i32 @atomicrmw_add_private_i32(
+; IR-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] {
+; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    [[NEW:%.*]] = add i32 [[TMP1]], 4
 ; IR-NEXT:    store i32 [[NEW]], ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    ret i32 [[TMP1]]
@@ -215,8 +225,9 @@ define i32 @atomicrmw_add_private_i32(ptr addrspace(5) %ptr) {
 }
 
 define i32 @atomicrmw_sub_private_i32(ptr addrspace(5) %ptr) {
-; IR-LABEL: @atomicrmw_sub_private_i32(
-; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
+; IR-LABEL: define i32 @atomicrmw_sub_private_i32(
+; IR-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] {
+; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    [[NEW:%.*]] = sub i32 [[TMP1]], 4
 ; IR-NEXT:    store i32 [[NEW]], ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    ret i32 [[TMP1]]
@@ -236,8 +247,9 @@ define i32 @atomicrmw_sub_private_i32(ptr addrspace(5) %ptr) {
 }
 
 define i32 @atomicrmw_and_private_i32(ptr addrspace(5) %ptr) {
-; IR-LABEL: @atomicrmw_and_private_i32(
-; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
+; IR-LABEL: define i32 @atomicrmw_and_private_i32(
+; IR-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] {
+; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    [[NEW:%.*]] = and i32 [[TMP1]], 4
 ; IR-NEXT:    store i32 [[NEW]], ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    ret i32 [[TMP1]]
@@ -257,8 +269,9 @@ define i32 @atomicrmw_and_private_i32(ptr addrspace(5) %ptr) {
 }
 
 define i32 @atomicrmw_nand_private_i32(ptr addrspace(5) %ptr) {
-; IR-LABEL: @atomicrmw_nand_private_i32(
-; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
+; IR-LABEL: define i32 @atomicrmw_nand_private_i32(
+; IR-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] {
+; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 4
 ; IR-NEXT:    [[NEW:%.*]] = xor i32 [[TMP2]], -1
 ; IR-NEXT:    store i32 [[NEW]], ptr addrspace(5) [[PTR]], align 4
@@ -280,8 +293,9 @@ define i32 @atomicrmw_nand_private_i32(ptr addrspace(5) %ptr) {
 }
 
 define i32 @atomicrmw_or_private_i32(ptr addrspace(5) %ptr) {
-; IR-LABEL: @atomicrmw_or_private_i32(
-; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
+; IR-LABEL: define i32 @atomicrmw_or_private_i32(
+; IR-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] {
+; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    [[NEW:%.*]] = or i32 [[TMP1]], 4
 ; IR-NEXT:    store i32 [[NEW]], ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    ret i32 [[TMP1]]
@@ -301,8 +315,9 @@ define i32 @atomicrmw_or_private_i32(ptr addrspace(5) %ptr) {
 }
 
 define i32 @atomicrmw_xor_private_i32(ptr addrspace(5) %ptr) {
-; IR-LABEL: @atomicrmw_xor_private_i32(
-; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
+; IR-LABEL: define i32 @atomicrmw_xor_private_i32(
+; IR-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] {
+; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    [[NEW:%.*]] = xor i32 [[TMP1]], 4
 ; IR-NEXT:    store i32 [[NEW]], ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    ret i32 [[TMP1]]
@@ -322,8 +337,9 @@ define i32 @atomicrmw_xor_private_i32(ptr addrspace(5) %ptr) {
 }
 
 define i32 @atomicrmw_max_private_i32(ptr addrspace(5) %ptr) {
-; IR-LABEL: @atomicrmw_max_private_i32(
-; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
+; IR-LABEL: define i32 @atomicrmw_max_private_i32(
+; IR-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] {
+; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    [[TMP2:%.*]] = icmp sgt i32 [[TMP1]], 4
 ; IR-NEXT:    [[NEW:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 4
 ; IR-NEXT:    store i32 [[NEW]], ptr addrspace(5) [[PTR]], align 4
@@ -344,8 +360,9 @@ define i32 @atomicrmw_max_private_i32(ptr addrspace(5) %ptr) {
 }
 
 define i32 @atomicrmw_min_private_i32(ptr addrspace(5) %ptr) {
-; IR-LABEL: @atomicrmw_min_private_i32(
-; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
+; IR-LABEL: define i32 @atomicrmw_min_private_i32(
+; IR-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] {
+; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    [[TMP2:%.*]] = icmp sle i32 [[TMP1]], 4
 ; IR-NEXT:    [[NEW:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 4
 ; IR-NEXT:    store i32 [[NEW]], ptr addrspace(5) [[PTR]], align 4
@@ -366,8 +383,9 @@ define i32 @atomicrmw_min_private_i32(ptr addrspace(5) %ptr) {
 }
 
 define i32 @atomicrmw_umax_private_i32(ptr addrspace(5) %ptr) {
-; IR-LABEL: @atomicrmw_umax_private_i32(
-; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
+; IR-LABEL: define i32 @atomicrmw_umax_private_i32(
+; IR-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] {
+; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    [[TMP2:%.*]] = icmp ugt i32 [[TMP1]], 4
 ; IR-NEXT:    [[NEW:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 4
 ; IR-NEXT:    store i32 [[NEW]], ptr addrspace(5) [[PTR]], align 4
@@ -388,8 +406,9 @@ define i32 @atomicrmw_umax_private_i32(ptr addrspace(5) %ptr) {
 }
 
 define i32 @atomicrmw_umin_private_i32(ptr addrspace(5) %ptr) {
-; IR-LABEL: @atomicrmw_umin_private_i32(
-; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
+; IR-LABEL: define i32 @atomicrmw_umin_private_i32(
+; IR-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] {
+; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    [[TMP2:%.*]] = icmp ule i32 [[TMP1]], 4
 ; IR-NEXT:    [[NEW:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 4
 ; IR-NEXT:    store i32 [[NEW]], ptr addrspace(5) [[PTR]], align 4
@@ -409,14 +428,15 @@ define i32 @atomicrmw_umin_private_i32(ptr addrspace(5) %ptr) {
   ret i32 %result
 }
 
-define float @atomicrmw_fadd_private_i32(ptr addrspace(5) %ptr) {
-; IR-LABEL: @atomicrmw_fadd_private_i32(
-; IR-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[PTR:%.*]], align 4
+define float @atomicrmw_fadd_private_f32(ptr addrspace(5) %ptr) {
+; IR-LABEL: define float @atomicrmw_fadd_private_f32(
+; IR-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] {
+; IR-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    [[NEW:%.*]] = fadd float [[TMP1]], 2.000000e+00
 ; IR-NEXT:    store float [[NEW]], ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    ret float [[TMP1]]
 ;
-; GCN-LABEL: atomicrmw_fadd_private_i32:
+; GCN-LABEL: atomicrmw_fadd_private_f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
@@ -430,10 +450,35 @@ define float @atomicrmw_fadd_private_i32(ptr addrspace(5) %ptr) {
   ret float %result
 }
 
+define bfloat @atomicrmw_fadd_private_bf16(ptr addrspace(5) %ptr) {
+; IR-LABEL: define bfloat @atomicrmw_fadd_private_bf16(
+; IR-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] {
+; IR-NEXT:    [[TMP1:%.*]] = load bfloat, ptr addrspace(5) [[PTR]], align 2
+; IR-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP1]], 0xR4000
+; IR-NEXT:    store bfloat [[NEW]], ptr addrspace(5) [[PTR]], align 2
+; IR-NEXT:    ret bfloat [[TMP1]]
+;
+; GCN-LABEL: atomicrmw_fadd_private_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    buffer_load_ushort v1, v0, s[0:3], 0 offen
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_add_f32_e32 v2, 2.0, v1
+; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = atomicrmw fadd ptr addrspace(5) %ptr, bfloat 2.0 seq_cst
+  ret bfloat %result
+}
+
 define float @atomicrmw_fsub_private_i32(ptr addrspace(5) %ptr, float %val) {
-; IR-LABEL: @atomicrmw_fsub_private_i32(
-; IR-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[PTR:%.*]], align 4
-; IR-NEXT:    [[NEW:%.*]] = fsub float [[TMP1]], [[VAL:%.*]]
+; IR-LABEL: define float @atomicrmw_fsub_private_i32(
+; IR-SAME: ptr addrspace(5) [[PTR:%.*]], float [[VAL:%.*]]) #[[ATTR0]] {
+; IR-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[PTR]], align 4
+; IR-NEXT:    [[NEW:%.*]] = fsub float [[TMP1]], [[VAL]]
 ; IR-NEXT:    store float [[NEW]], ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    ret float [[TMP1]]
 ;
@@ -452,17 +497,18 @@ define float @atomicrmw_fsub_private_i32(ptr addrspace(5) %ptr, float %val) {
 }
 
 define amdgpu_kernel void @alloca_promote_atomicrmw_private_lds_promote(ptr addrspace(1) %out, i32 %in) nounwind {
-; IR-LABEL: @alloca_promote_atomicrmw_private_lds_promote(
+; IR-LABEL: define amdgpu_kernel void @alloca_promote_atomicrmw_private_lds_promote(
+; IR-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) #[[ATTR1:[0-9]+]] {
 ; IR-NEXT:  entry:
 ; IR-NEXT:    [[TMP:%.*]] = alloca [2 x i32], align 4, addrspace(5)
 ; IR-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [2 x i32], ptr addrspace(5) [[TMP]], i32 0, i32 1
 ; IR-NEXT:    store i32 0, ptr addrspace(5) [[TMP]], align 4
 ; IR-NEXT:    store i32 1, ptr addrspace(5) [[GEP2]], align 4
-; IR-NEXT:    [[GEP3:%.*]] = getelementptr inbounds [2 x i32], ptr addrspace(5) [[TMP]], i32 0, i32 [[IN:%.*]]
+; IR-NEXT:    [[GEP3:%.*]] = getelementptr inbounds [2 x i32], ptr addrspace(5) [[TMP]], i32 0, i32 [[IN]]
 ; IR-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[GEP3]], align 4
 ; IR-NEXT:    [[NEW:%.*]] = add i32 [[TMP0]], 7
 ; IR-NEXT:    store i32 [[NEW]], ptr addrspace(5) [[GEP3]], align 4
-; IR-NEXT:    store i32 [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 4
+; IR-NEXT:    store i32 [[TMP0]], ptr addrspace(1) [[OUT]], align 4
 ; IR-NEXT:    ret void
 ;
 ; GCN-LABEL: alloca_promote_atomicrmw_private_lds_promote:
@@ -489,13 +535,14 @@ entry:
 }
 
 define amdgpu_kernel void @alloca_promote_cmpxchg_private(ptr addrspace(1) %out, i32 %in) nounwind {
-; IR-LABEL: @alloca_promote_cmpxchg_private(
+; IR-LABEL: define amdgpu_kernel void @alloca_promote_cmpxchg_private(
+; IR-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]]) #[[ATTR1]] {
 ; IR-NEXT:  entry:
 ; IR-NEXT:    [[TMP:%.*]] = alloca [2 x i32], align 4, addrspace(5)
 ; IR-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [2 x i32], ptr addrspace(5) [[TMP]], i32 0, i32 1
 ; IR-NEXT:    store i32 0, ptr addrspace(5) [[TMP]], align 4
 ; IR-NEXT:    store i32 1, ptr addrspace(5) [[GEP2]], align 4
-; IR-NEXT:    [[GEP3:%.*]] = getelementptr inbounds [2 x i32], ptr addrspace(5) [[TMP]], i32 0, i32 [[IN:%.*]]
+; IR-NEXT:    [[GEP3:%.*]] = getelementptr inbounds [2 x i32], ptr addrspace(5) [[TMP]], i32 0, i32 [[IN]]
 ; IR-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[GEP3]], align 4
 ; IR-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
 ; IR-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 1, i32 [[TMP0]]
@@ -503,7 +550,7 @@ define amdgpu_kernel void @alloca_promote_cmpxchg_private(ptr addrspace(1) %out,
 ; IR-NEXT:    [[TMP3:%.*]] = insertvalue { i32, i1 } poison, i32 [[TMP0]], 0
 ; IR-NEXT:    [[TMP4:%.*]] = insertvalue { i32, i1 } [[TMP3]], i1 [[TMP1]], 1
 ; IR-NEXT:    [[VAL:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; IR-NEXT:    store i32 [[VAL]], ptr addrspace(1) [[OUT:%.*]], align 4
+; IR-NEXT:    store i32 [[VAL]], ptr addrspace(1) [[OUT]], align 4
 ; IR-NEXT:    ret void
 ;
 ; GCN-LABEL: alloca_promote_cmpxchg_private:
@@ -531,8 +578,9 @@ entry:
 }
 
 define i32 @atomicrmw_inc_private_i32(ptr addrspace(5) %ptr) {
-; IR-LABEL: @atomicrmw_inc_private_i32(
-; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
+; IR-LABEL: define i32 @atomicrmw_inc_private_i32(
+; IR-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] {
+; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 1
 ; IR-NEXT:    [[TMP3:%.*]] = icmp uge i32 [[TMP1]], 4
 ; IR-NEXT:    [[NEW:%.*]] = select i1 [[TMP3]], i32 0, i32 [[TMP2]]
@@ -556,8 +604,9 @@ define i32 @atomicrmw_inc_private_i32(ptr addrspace(5) %ptr) {
 }
 
 define i32 @atomicrmw_dec_private_i32(ptr addrspace(5) %ptr) {
-; IR-LABEL: @atomicrmw_dec_private_i32(
-; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
+; IR-LABEL: define i32 @atomicrmw_dec_private_i32(
+; IR-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] {
+; IR-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR]], align 4
 ; IR-NEXT:    [[TMP2:%.*]] = sub i32 [[TMP1]], 1
 ; IR-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[TMP1]], 0
 ; IR-NEXT:    [[TMP4:%.*]] = icmp ugt i32 [[TMP1]], 4

diff  --git a/llvm/test/CodeGen/AMDGPU/select-undef.ll b/llvm/test/CodeGen/AMDGPU/select-undef.ll
index b35bcbff94d5bf..5b9866a3c91571 100644
--- a/llvm/test/CodeGen/AMDGPU/select-undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-undef.ll
@@ -258,3 +258,180 @@ define void @inf_loop_undef_vector(<6 x float> %arg, float %arg1, i64 %arg2) {
   store volatile i64 %i7, ptr addrspace(1) undef, align 4
   ret void
 }
+
+; GCN-LABEL: {{^}}undef_bf16:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_bf16(ptr addrspace(3) %ptr, i1 %cond) {
+entry:
+  br label %loop
+
+loop:
+  %phi = phi bfloat [ undef, %entry ], [ %add, %loop ]
+  %load = load volatile bfloat, ptr addrspace(3) undef
+  %bc.0 = bitcast bfloat %load to i16
+  %bc.1 = bitcast bfloat %phi to i16
+  %add.i = add i16 %bc.0, %bc.1
+  %add = bitcast i16 %add.i to bfloat
+  br i1 %cond, label %loop, label %ret
+
+ret:
+  store volatile bfloat %add, ptr addrspace(3) undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}undef_v2bf16:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v2bf16(ptr addrspace(3) %ptr, i1 %cond) {
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <2 x bfloat> [ undef, %entry ], [ %add, %loop ]
+  %load = load volatile <2 x bfloat>, ptr addrspace(3) undef
+  %bc.0 = bitcast <2 x bfloat> %load to <2 x i16>
+  %bc.1 = bitcast <2 x bfloat> %phi to <2 x i16>
+  %add.i = add <2 x i16> %bc.0, %bc.1
+  %add = bitcast <2 x i16> %add.i to <2 x bfloat>
+  br i1 %cond, label %loop, label %ret
+
+ret:
+  store volatile <2 x bfloat> %add, ptr addrspace(3) undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}undef_v3bf16:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v3bf16(ptr addrspace(3) %ptr, i1 %cond) {
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <3 x bfloat> [ undef, %entry ], [ %add, %loop ]
+  %load = load volatile <3 x bfloat>, ptr addrspace(3) undef
+  %bc.0 = bitcast <3 x bfloat> %load to <3 x i16>
+  %bc.1 = bitcast <3 x bfloat> %phi to <3 x i16>
+  %add.i = add <3 x i16> %bc.0, %bc.1
+  %add = bitcast <3 x i16> %add.i to <3 x bfloat>
+  br i1 %cond, label %loop, label %ret
+
+ret:
+  store volatile <3 x bfloat> %add, ptr addrspace(3) undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}undef_v4bf16:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v4bf16(ptr addrspace(3) %ptr, i1 %cond) {
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <4 x bfloat> [ undef, %entry ], [ %add, %loop ]
+  %load = load volatile <4 x bfloat>, ptr addrspace(3) undef
+  %bc.0 = bitcast <4 x bfloat> %load to <4 x i16>
+  %bc.1 = bitcast <4 x bfloat> %phi to <4 x i16>
+  %add.i = add <4 x i16> %bc.0, %bc.1
+  %add = bitcast <4 x i16> %add.i to <4 x bfloat>
+  br i1 %cond, label %loop, label %ret
+
+ret:
+  store volatile <4 x bfloat> %add, ptr addrspace(3) undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}undef_v6bf16:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v6bf16(ptr addrspace(3) %ptr, i1 %cond) {
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <6 x bfloat> [ undef, %entry ], [ %add, %loop ]
+  %load = load volatile <6 x bfloat>, ptr addrspace(3) undef
+  %bc.0 = bitcast <6 x bfloat> %load to <6 x i16>
+  %bc.1 = bitcast <6 x bfloat> %phi to <6 x i16>
+  %add.i = add <6 x i16> %bc.0, %bc.1
+  %add = bitcast <6 x i16> %add.i to <6 x bfloat>
+  br i1 %cond, label %loop, label %ret
+
+ret:
+  store volatile <6 x bfloat> %add, ptr addrspace(3) undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}undef_v8bf16:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v8bf16(ptr addrspace(3) %ptr, i1 %cond) {
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <8 x bfloat> [ undef, %entry ], [ %add, %loop ]
+  %load = load volatile <8 x bfloat>, ptr addrspace(3) undef
+  %bc.0 = bitcast <8 x bfloat> %load to <8 x i16>
+  %bc.1 = bitcast <8 x bfloat> %phi to <8 x i16>
+  %add.i = add <8 x i16> %bc.0, %bc.1
+  %add = bitcast <8 x i16> %add.i to <8 x bfloat>
+  br i1 %cond, label %loop, label %ret
+
+ret:
+  store volatile <8 x bfloat> %add, ptr addrspace(3) undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}undef_v16bf16:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v16bf16(ptr addrspace(3) %ptr, i1 %cond) {
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <16 x bfloat> [ undef, %entry ], [ %add, %loop ]
+  %load = load volatile <16 x bfloat>, ptr addrspace(3) undef
+  %bc.0 = bitcast <16 x bfloat> %load to <16 x i16>
+  %bc.1 = bitcast <16 x bfloat> %phi to <16 x i16>
+  %add.i = add <16 x i16> %bc.0, %bc.1
+  %add = bitcast <16 x i16> %add.i to <16 x bfloat>
+  br i1 %cond, label %loop, label %ret
+
+ret:
+  store volatile <16 x bfloat> %add, ptr addrspace(3) undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}undef_v32bf16:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v32bf16(ptr addrspace(3) %ptr, i1 %cond) {
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <32 x bfloat> [ undef, %entry ], [ %add, %loop ]
+  %load = load volatile <32 x bfloat>, ptr addrspace(3) undef
+  %bc.0 = bitcast <32 x bfloat> %load to <32 x i16>
+  %bc.1 = bitcast <32 x bfloat> %phi to <32 x i16>
+  %add.i = add <32 x i16> %bc.0, %bc.1
+  %add = bitcast <32 x i16> %add.i to <32 x bfloat>
+  br i1 %cond, label %loop, label %ret
+
+ret:
+  store volatile <32 x bfloat> %add, ptr addrspace(3) undef
+  ret void
+}
+


        


More information about the llvm-commits mailing list