[llvm] 76db473 - [AArch64] Add bf16 instruction coverage. NFC

David Green via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 6 03:38:07 PST 2024


Author: David Green
Date: 2024-12-06T11:38:02Z
New Revision: 76db47335903cb65d3027c0a77658f488d8ce659

URL: https://github.com/llvm/llvm-project/commit/76db47335903cb65d3027c0a77658f488d8ce659
DIFF: https://github.com/llvm/llvm-project/commit/76db47335903cb65d3027c0a77658f488d8ce659.diff

LOG: [AArch64] Add bf16 instruction coverage. NFC

These are the same tests as fp16-instructions.ll, fp16-v4-instructions.ll and
fp16-v8-instructions.ll, ported to bf16.

Added: 
    llvm/test/CodeGen/AArch64/bf16-instructions.ll
    llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll
    llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll

Modified: 
    

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/AArch64/bf16-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-instructions.ll
new file mode 100644
index 00000000000000..33997614598c3a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/bf16-instructions.ll
@@ -0,0 +1,2347 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple aarch64-unknown-unknown | FileCheck %s --check-prefixes=CHECK,CHECK-CVT
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -mattr=+bf16 | FileCheck %s --check-prefixes=CHECK,CHECK-BF16
+
+; Scalar bfloat fadd. Both operands are widened to f32 (fmov to a GPR, lsl #16),
+; added with a single-precision fadd, and the result is narrowed back to bfloat:
+; without +bf16 by an integer sequence (add 0x7fff plus the current bit 16, then
+; lsr #16), with +bf16 by a single bfcvt.
+define bfloat @test_fadd(bfloat %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_fadd:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w9, s1
+; CHECK-CVT-NEXT:    fmov w10, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    lsl w9, w9, #16
+; CHECK-CVT-NEXT:    lsl w10, w10, #16
+; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    fmov s1, w10
+; CHECK-CVT-NEXT:    fadd s0, s1, s0
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_fadd:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s1
+; CHECK-BF16-NEXT:    fmov w9, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    lsl w9, w9, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    fadd s0, s1, s0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %r = fadd bfloat %a, %b
+  ret bfloat %r
+}
+
+; Scalar bfloat fsub. The operands are shifted left by 16 to form f32 values,
+; subtracted with a single-precision fsub, and the difference is narrowed to
+; bfloat: an integer add/shift sequence without +bf16, bfcvt with +bf16.
+define bfloat @test_fsub(bfloat %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_fsub:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w9, s1
+; CHECK-CVT-NEXT:    fmov w10, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    lsl w9, w9, #16
+; CHECK-CVT-NEXT:    lsl w10, w10, #16
+; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    fmov s1, w10
+; CHECK-CVT-NEXT:    fsub s0, s1, s0
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_fsub:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s1
+; CHECK-BF16-NEXT:    fmov w9, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    lsl w9, w9, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    fsub s0, s1, s0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %r = fsub bfloat %a, %b
+  ret bfloat %r
+}
+
+; Scalar bfloat fmul. The operands are shifted left by 16 to form f32 values,
+; multiplied with a single-precision fmul, and the product is narrowed to
+; bfloat: an integer add/shift sequence without +bf16, bfcvt with +bf16.
+define bfloat @test_fmul(bfloat %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_fmul:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w9, s1
+; CHECK-CVT-NEXT:    fmov w10, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    lsl w9, w9, #16
+; CHECK-CVT-NEXT:    lsl w10, w10, #16
+; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    fmov s1, w10
+; CHECK-CVT-NEXT:    fmul s0, s1, s0
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_fmul:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s1
+; CHECK-BF16-NEXT:    fmov w9, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    lsl w9, w9, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    fmul s0, s1, s0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %r = fmul bfloat %a, %b
+  ret bfloat %r
+}
+
+; 'fmul fast' feeding 'fadd fast' on bfloat. The expected code in both runs
+; keeps a separate f32 fmul and f32 fadd: the product is narrowed to bfloat and
+; widened again (lsl #16) before the add, so no fused multiply-add instruction
+; is expected here.
+define bfloat @test_fmadd(bfloat %a, bfloat %b, bfloat %c) #0 {
+; CHECK-CVT-LABEL: test_fmadd:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w8, s1
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w10, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    // kill: def $h2 killed $h2 def $s2
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    lsl w9, w9, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    fmul s0, s1, s0
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    ubfx w9, w8, #16, #1
+; CHECK-CVT-NEXT:    add w8, w8, w10
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    fmov w9, s2
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    lsl w9, w9, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    fadd s0, s0, s1
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    ubfx w9, w8, #16, #1
+; CHECK-CVT-NEXT:    add w8, w8, w10
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_fmadd:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s1
+; CHECK-BF16-NEXT:    fmov w9, s0
+; CHECK-BF16-NEXT:    // kill: def $h2 killed $h2 def $s2
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    lsl w9, w9, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    fmov w9, s2
+; CHECK-BF16-NEXT:    fmul s0, s1, s0
+; CHECK-BF16-NEXT:    lsl w9, w9, #16
+; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    fadd s0, s0, s1
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %mul = fmul fast bfloat %a, %b
+  %r = fadd fast bfloat %mul, %c
+  ret bfloat %r
+}
+
+; Scalar bfloat fdiv. The operands are shifted left by 16 to form f32 values,
+; divided with a single-precision fdiv, and the quotient is narrowed to bfloat:
+; an integer add/shift sequence without +bf16, bfcvt with +bf16.
+define bfloat @test_fdiv(bfloat %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_fdiv:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w9, s1
+; CHECK-CVT-NEXT:    fmov w10, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    lsl w9, w9, #16
+; CHECK-CVT-NEXT:    lsl w10, w10, #16
+; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    fmov s1, w10
+; CHECK-CVT-NEXT:    fdiv s0, s1, s0
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_fdiv:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s1
+; CHECK-BF16-NEXT:    fmov w9, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    lsl w9, w9, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    fdiv s0, s1, s0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %r = fdiv bfloat %a, %b
+  ret bfloat %r
+}
+
+; Scalar bfloat frem. There is no inline expansion: both operands are widened
+; to f32 and the remainder comes from a call to fmodf, which is why x30 is
+; spilled and reloaded. The f32 result is then narrowed to bfloat (integer
+; sequence without +bf16, bfcvt with +bf16).
+define bfloat @test_frem(bfloat %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_frem:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    fmov w9, s1
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    lsl w9, w9, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    bl fmodf
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_frem:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    fmov w9, s1
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    lsl w9, w9, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    bl fmodf
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
+  %r = frem bfloat %a, %b
+  ret bfloat %r
+}
+
+; Stores the bfloat argument through %b. No conversion is involved: the
+; expected code is a single 16-bit 'str h0', identical for both RUN lines.
+define void @test_store(bfloat %a, ptr %b) #0 {
+; CHECK-LABEL: test_store:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str h0, [x0]
+; CHECK-NEXT:    ret
+  store bfloat %a, ptr %b
+  ret void
+}
+
+; Loads a bfloat from %a and returns it. No conversion is involved: the
+; expected code is a single 16-bit 'ldr h0', identical for both RUN lines.
+define bfloat @test_load(ptr %a) #0 {
+; CHECK-LABEL: test_load:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [x0]
+; CHECK-NEXT:    ret
+  %r = load bfloat, ptr %a
+  ret bfloat %r
+}
+
+; External function taking two bfloat arguments and returning bfloat; it is the
+; call target for the call / tail-call tests in this file.
+declare bfloat @test_callee(bfloat %a, bfloat %b) #0
+
+; Forwards both bfloat arguments to test_callee in their original order. The
+; expected code moves no argument registers; it only saves and restores x30
+; around the 'bl'.
+define bfloat @test_call(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_call:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    bl test_callee
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %r = call bfloat @test_callee(bfloat %a, bfloat %b)
+  ret bfloat %r
+}
+
+; Calls test_callee with the two bfloat arguments swapped. The swap is done on
+; the 32-bit views of the registers (s0 <-> s1 through s2) before the 'bl'.
+define bfloat @test_call_flipped(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_call_flipped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    fmov s2, s0
+; CHECK-NEXT:    fmov s0, s1
+; CHECK-NEXT:    fmov s1, s2
+; CHECK-NEXT:    bl test_callee
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %r = call bfloat @test_callee(bfloat %b, bfloat %a)
+  ret bfloat %r
+}
+
+; Tail-call variant of the swapped-argument call: after exchanging s0 and s1
+; through s2 the function branches ('b') to test_callee, so no link-register
+; spill and no 'ret' are expected.
+define bfloat @test_tailcall_flipped(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_tailcall_flipped:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s2, s0
+; CHECK-NEXT:    fmov s0, s1
+; CHECK-NEXT:    fmov s1, s2
+; CHECK-NEXT:    b test_callee
+  %r = tail call bfloat @test_callee(bfloat %b, bfloat %a)
+  ret bfloat %r
+}
+
+; Select between two bfloat values on an i1 argument. The condition is tested
+; with 'cmp w0, #0' and the choice is made by fcsel on the s-register views of
+; the operands; no bfloat <-> f32 conversion is needed.
+define bfloat @test_select(bfloat %a, bfloat %b, i1 zeroext %c) #0 {
+; CHECK-LABEL: test_select:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    cmp w0, #0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT:    fcsel s0, s0, s1, ne
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT:    ret
+  %r = select i1 %c, bfloat %a, bfloat %b
+  ret bfloat %r
+}
+
+; Select between bfloat %a and %b on 'fcmp une' of bfloat %c and %d. Only the
+; compared operands are widened to f32 (lsl #16) for the fcmp; the selected
+; values are passed through fcsel unchanged.
+define bfloat @test_select_cc(bfloat %a, bfloat %b, bfloat %c, bfloat %d) #0 {
+; CHECK-LABEL: test_select_cc:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h3 killed $h3 def $s3
+; CHECK-NEXT:    // kill: def $h2 killed $h2 def $s2
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    fmov s3, w9
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    fcsel s0, s0, s1, ne
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT:    ret
+  %cc = fcmp une bfloat %c, %d
+  %r = select i1 %cc, bfloat %a, bfloat %b
+  ret bfloat %r
+}
+
+; Select between two float values on 'fcmp une' of two bfloat values. The
+; bfloat operands are widened to f32 for the compare; the float operands go
+; straight into fcsel.
+; NOTE(review): the "_f16" in the name presumably carries over from the fp16
+; version of this test; the compared type here is bfloat.
+define float @test_select_cc_f32_f16(float %a, float %b, bfloat %c, bfloat %d) #0 {
+; CHECK-LABEL: test_select_cc_f32_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h3 killed $h3 def $s3
+; CHECK-NEXT:    // kill: def $h2 killed $h2 def $s2
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    fmov s3, w9
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    fcsel s0, s0, s1, ne
+; CHECK-NEXT:    ret
+  %cc = fcmp une bfloat %c, %d
+  %r = select i1 %cc, float %a, float %b
+  ret float %r
+}
+
+; Select between two bfloat values on 'fcmp une' of two float values. The
+; compare works on s2/s3 directly and the bfloat operands are selected through
+; fcsel on their s-register views, so no conversion code is expected.
+; NOTE(review): the "_f16" in the name presumably carries over from the fp16
+; version of this test; the selected type here is bfloat.
+define bfloat @test_select_cc_f16_f32(bfloat %a, bfloat %b, float %c, float %d) #0 {
+; CHECK-LABEL: test_select_cc_f16_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmp s2, s3
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT:    fcsel s0, s0, s1, ne
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT:    ret
+  %cc = fcmp une float %c, %d
+  %r = select i1 %cc, bfloat %a, bfloat %b
+  ret bfloat %r
+}
+
+; fcmp une on bfloat: both operands are widened to f32 (lsl #16), compared
+; with one fcmp, and the i1 result is produced by 'cset ne'.
+define i1 @test_fcmp_une(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_une:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %r = fcmp une bfloat %a, %b
+  ret i1 %r
+}
+
+; fcmp ueq on bfloat: operands are widened to f32 and compared once. The
+; predicate needs two condition codes, so the result is built from 'cset eq'
+; followed by 'csinc ..., vc'.
+define i1 @test_fcmp_ueq(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_ueq:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    cset w8, eq
+; CHECK-NEXT:    csinc w0, w8, wzr, vc
+; CHECK-NEXT:    ret
+  %r = fcmp ueq bfloat %a, %b
+  ret i1 %r
+}
+
+; fcmp ugt on bfloat: both operands are widened to f32 (lsl #16), compared
+; with one fcmp, and the i1 result is produced by 'cset hi'.
+define i1 @test_fcmp_ugt(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_ugt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    cset w0, hi
+; CHECK-NEXT:    ret
+  %r = fcmp ugt bfloat %a, %b
+  ret i1 %r
+}
+
+; fcmp uge on bfloat: both operands are widened to f32 (lsl #16), compared
+; with one fcmp, and the i1 result is produced by 'cset pl'.
+define i1 @test_fcmp_uge(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_uge:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    cset w0, pl
+; CHECK-NEXT:    ret
+  %r = fcmp uge bfloat %a, %b
+  ret i1 %r
+}
+
+; fcmp ult on bfloat: both operands are widened to f32 (lsl #16), compared
+; with one fcmp, and the i1 result is produced by 'cset lt'.
+define i1 @test_fcmp_ult(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_ult:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    cset w0, lt
+; CHECK-NEXT:    ret
+  %r = fcmp ult bfloat %a, %b
+  ret i1 %r
+}
+
+; fcmp ule on bfloat: both operands are widened to f32 (lsl #16), compared
+; with one fcmp, and the i1 result is produced by 'cset le'.
+define i1 @test_fcmp_ule(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_ule:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    cset w0, le
+; CHECK-NEXT:    ret
+  %r = fcmp ule bfloat %a, %b
+  ret i1 %r
+}
+
+; fcmp uno on bfloat: both operands are widened to f32 (lsl #16), compared
+; with one fcmp, and the i1 result is produced by 'cset vs'.
+define i1 @test_fcmp_uno(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_uno:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    cset w0, vs
+; CHECK-NEXT:    ret
+  %r = fcmp uno bfloat %a, %b
+  ret i1 %r
+}
+
+; fcmp one on bfloat: operands are widened to f32 and compared once. The
+; predicate needs two condition codes, so the result is built from 'cset mi'
+; followed by 'csinc ..., le'.
+define i1 @test_fcmp_one(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_one:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    cset w8, mi
+; CHECK-NEXT:    csinc w0, w8, wzr, le
+; CHECK-NEXT:    ret
+  %r = fcmp one bfloat %a, %b
+  ret i1 %r
+}
+
+; fcmp oeq on bfloat: both operands are widened to f32 (lsl #16), compared
+; with one fcmp, and the i1 result is produced by 'cset eq'.
+define i1 @test_fcmp_oeq(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_oeq:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %r = fcmp oeq bfloat %a, %b
+  ret i1 %r
+}
+
+; fcmp ogt on bfloat: both operands are widened to f32 (lsl #16), compared
+; with one fcmp, and the i1 result is produced by 'cset gt'.
+define i1 @test_fcmp_ogt(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_ogt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    cset w0, gt
+; CHECK-NEXT:    ret
+  %r = fcmp ogt bfloat %a, %b
+  ret i1 %r
+}
+
+; fcmp oge on bfloat: both operands are widened to f32 (lsl #16), compared
+; with one fcmp, and the i1 result is produced by 'cset ge'.
+define i1 @test_fcmp_oge(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_oge:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    cset w0, ge
+; CHECK-NEXT:    ret
+  %r = fcmp oge bfloat %a, %b
+  ret i1 %r
+}
+
+; fcmp olt on bfloat: both operands are widened to f32 (lsl #16), compared
+; with one fcmp, and the i1 result is produced by 'cset mi'.
+define i1 @test_fcmp_olt(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_olt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    cset w0, mi
+; CHECK-NEXT:    ret
+  %r = fcmp olt bfloat %a, %b
+  ret i1 %r
+}
+
+; fcmp ole on bfloat: both operands are widened to f32 (lsl #16), compared
+; with one fcmp, and the i1 result is produced by 'cset ls'.
+define i1 @test_fcmp_ole(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_ole:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    cset w0, ls
+; CHECK-NEXT:    ret
+  %r = fcmp ole bfloat %a, %b
+  ret i1 %r
+}
+
+; fcmp ord on bfloat: both operands are widened to f32 (lsl #16), compared
+; with one fcmp, and the i1 result is produced by 'cset vc'.
+define i1 @test_fcmp_ord(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_ord:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    cset w0, vc
+; CHECK-NEXT:    ret
+  %r = fcmp ord bfloat %a, %b
+  ret i1 %r
+}
+
+; Two bfloat compares of %in against constants, and-ed together and used as a
+; select condition. The expected code folds the pair into fcmp + fccmp and the
+; select into fcsel. The compare constants appear as f32 bit patterns built
+; with movi (#69 and #72, each shifted left by 24, i.e. 0xR4500 and 0xR4800
+; widened), while the bfloat 0xR4500 select operand is loaded from a constant
+; pool entry.
+; NOTE(review): unlike the neighbouring tests, this define carries no #0
+; attribute group -- confirm whether that is intentional.
+define void @test_fccmp(bfloat %in, ptr %out) {
+; CHECK-LABEL: test_fccmp:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    movi v1.2s, #69, lsl #24
+; CHECK-NEXT:    movi v3.2s, #72, lsl #24
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    adrp x8, .LCPI29_0
+; CHECK-NEXT:    fcmp s2, s1
+; CHECK-NEXT:    ldr h1, [x8, :lo12:.LCPI29_0]
+; CHECK-NEXT:    fccmp s2, s3, #4, mi
+; CHECK-NEXT:    fcsel s0, s0, s1, gt
+; CHECK-NEXT:    str h0, [x0]
+; CHECK-NEXT:    ret
+  %cmp1 = fcmp ogt bfloat %in, 0xR4800
+  %cmp2 = fcmp olt bfloat %in, 0xR4500
+  %cond = and i1 %cmp1, %cmp2
+  %result = select i1 %cond, bfloat %in, bfloat 0xR4500
+  store bfloat %result, ptr %out
+  ret void
+}
+
+; Conditional branch on 'fcmp uge' of two bfloat values. Both successors store
+; i32 0 and return, so the expected code contains no branch: after widening
+; the operands to f32 and comparing them, csel (pl) picks %p1 or %p2 and a
+; single 'str wzr' performs the store.
+define void @test_br_cc(bfloat %a, bfloat %b, ptr %p1, ptr %p2) #0 {
+; CHECK-LABEL: test_br_cc:
+; CHECK:       // %bb.0: // %common.ret
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    csel x8, x0, x1, pl
+; CHECK-NEXT:    str wzr, [x8]
+; CHECK-NEXT:    ret
+  %c = fcmp uge bfloat %a, %b
+  br i1 %c, label %then, label %else
+then:
+  store i32 0, ptr %p1
+  ret void
+else:
+  store i32 0, ptr %p2
+  ret void
+}
+
+; Loop-carried bfloat phi. Each iteration reloads a bfloat from %p1 and calls
+; test_dummy; the returned value is the phi from the last iteration. The two
+; live bfloat values sit in s8/s9 across the call, with d8/d9 saved in the
+; prologue and restored in the epilogue, and no bfloat <-> f32 conversion is
+; expected.
+define bfloat @test_phi(ptr %p1) #0 {
+; CHECK-LABEL: test_phi:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    ldr h9, [x0]
+; CHECK-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:  .LBB31_1: // %loop
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    fmov s8, s9
+; CHECK-NEXT:    ldr h9, [x19]
+; CHECK-NEXT:    mov x0, x19
+; CHECK-NEXT:    bl test_dummy
+; CHECK-NEXT:    tbnz w0, #0, .LBB31_1
+; CHECK-NEXT:  // %bb.2: // %return
+; CHECK-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    fmov s0, s8
+; CHECK-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %a = load bfloat, ptr %p1
+  br label %loop
+loop:
+  %r = phi bfloat [%a, %entry], [%b, %loop]
+  %b = load bfloat, ptr %p1
+  %c = call i1 @test_dummy(ptr %p1)
+  br i1 %c, label %loop, label %return
+return:
+  ret bfloat %r
+}
+
+; External predicate called once per iteration by test_phi; its i1 result
+; decides whether the loop continues.
+declare i1 @test_dummy(ptr %p1) #0
+
+; fptosi bfloat -> i32: the input is widened to f32 (lsl #16) and converted
+; with 'fcvtzs w0, s0'. Identical for both RUN lines.
+define i32 @test_fptosi_i32(bfloat %a) #0 {
+; CHECK-LABEL: test_fptosi_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fcvtzs w0, s0
+; CHECK-NEXT:    ret
+  %r = fptosi bfloat %a to i32
+  ret i32 %r
+}
+
+; fptosi bfloat -> i64: the input is widened to f32 (lsl #16) and converted
+; with 'fcvtzs x0, s0'. Identical for both RUN lines.
+define i64 @test_fptosi_i64(bfloat %a) #0 {
+; CHECK-LABEL: test_fptosi_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fcvtzs x0, s0
+; CHECK-NEXT:    ret
+  %r = fptosi bfloat %a to i64
+  ret i64 %r
+}
+
+; fptoui bfloat -> i32: the input is widened to f32 (lsl #16) and converted
+; with 'fcvtzu w0, s0'. Identical for both RUN lines.
+define i32 @test_fptoui_i32(bfloat %a) #0 {
+; CHECK-LABEL: test_fptoui_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fcvtzu w0, s0
+; CHECK-NEXT:    ret
+  %r = fptoui bfloat %a to i32
+  ret i32 %r
+}
+
+; fptoui bfloat -> i64: the input is widened to f32 (lsl #16) and converted
+; with 'fcvtzu x0, s0'. Identical for both RUN lines.
+define i64 @test_fptoui_i64(bfloat %a) #0 {
+; CHECK-LABEL: test_fptoui_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fcvtzu x0, s0
+; CHECK-NEXT:    ret
+  %r = fptoui bfloat %a to i64
+  ret i64 %r
+}
+
+; uitofp i32 -> bfloat. The integer is first converted to double (ucvtf),
+; narrowed to f32 with fcvtxn, and only then narrowed to bfloat: an integer
+; add/shift sequence without +bf16, bfcvt with +bf16.
+define bfloat @test_uitofp_i32(i32 %a) #0 {
+; CHECK-CVT-LABEL: test_uitofp_i32:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    ucvtf d0, w0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    fcvtxn s0, d0
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_uitofp_i32:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    ucvtf d0, w0
+; CHECK-BF16-NEXT:    fcvtxn s0, d0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %r = uitofp i32 %a to bfloat
+  ret bfloat %r
+}
+
+; uitofp i64 -> bfloat. The value goes through double, f32 (fcvtxn) and then
+; bfloat. Before the ucvtf, inputs with any bit at position 53 or above
+; (lsr #53 != 0) have their low 12 bits masked off; if any of those 12 bits
+; was set, a 1 is OR-ed into the lowest bit of the double's bit pattern so the
+; discarded bits still influence the later narrowing steps.
+define bfloat @test_uitofp_i64(i64 %a) #0 {
+; CHECK-CVT-LABEL: test_uitofp_i64:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    lsr x9, x0, #53
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    cmp x9, #0
+; CHECK-CVT-NEXT:    and x9, x0, #0xfffffffffffff000
+; CHECK-CVT-NEXT:    csel x9, x9, x0, ne
+; CHECK-CVT-NEXT:    ucvtf d0, x9
+; CHECK-CVT-NEXT:    cset w9, ne
+; CHECK-CVT-NEXT:    tst x0, #0xfff
+; CHECK-CVT-NEXT:    csel w9, wzr, w9, eq
+; CHECK-CVT-NEXT:    fmov x10, d0
+; CHECK-CVT-NEXT:    orr x9, x10, x9
+; CHECK-CVT-NEXT:    fmov d0, x9
+; CHECK-CVT-NEXT:    fcvtxn s0, d0
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_uitofp_i64:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    lsr x8, x0, #53
+; CHECK-BF16-NEXT:    and x9, x0, #0xfffffffffffff000
+; CHECK-BF16-NEXT:    cmp x8, #0
+; CHECK-BF16-NEXT:    csel x8, x9, x0, ne
+; CHECK-BF16-NEXT:    ucvtf d0, x8
+; CHECK-BF16-NEXT:    cset w8, ne
+; CHECK-BF16-NEXT:    tst x0, #0xfff
+; CHECK-BF16-NEXT:    csel w8, wzr, w8, eq
+; CHECK-BF16-NEXT:    fmov x9, d0
+; CHECK-BF16-NEXT:    orr x8, x9, x8
+; CHECK-BF16-NEXT:    fmov d0, x8
+; CHECK-BF16-NEXT:    fcvtxn s0, d0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %r = uitofp i64 %a to bfloat
+  ret bfloat %r
+}
+
+; sitofp i32 -> bfloat. The integer is first converted to double (scvtf),
+; narrowed to f32 with fcvtxn, and only then narrowed to bfloat: an integer
+; add/shift sequence without +bf16, bfcvt with +bf16.
+define bfloat @test_sitofp_i32(i32 %a) #0 {
+; CHECK-CVT-LABEL: test_sitofp_i32:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    scvtf d0, w0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    fcvtxn s0, d0
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_sitofp_i32:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    scvtf d0, w0
+; CHECK-BF16-NEXT:    fcvtxn s0, d0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %r = sitofp i32 %a to bfloat
+  ret bfloat %r
+}
+
+; sitofp i64 -> bfloat. The conversion works on the magnitude (cneg ... mi):
+; magnitudes with any bit at position 53 or above have their low 12 bits
+; masked off before scvtf, and a 1 is OR-ed into the lowest bit of the
+; double's bit pattern when any masked bit was set. The original sign bit
+; (and #0x8000000000000000) is OR-ed back in before the value is narrowed to
+; f32 (fcvtxn) and then to bfloat.
+define bfloat @test_sitofp_i64(i64 %a) #0 {
+; CHECK-CVT-LABEL: test_sitofp_i64:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    cmp x0, #0
+; CHECK-CVT-NEXT:    and x11, x0, #0x8000000000000000
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    cneg x9, x0, mi
+; CHECK-CVT-NEXT:    lsr x10, x9, #53
+; CHECK-CVT-NEXT:    cmp x10, #0
+; CHECK-CVT-NEXT:    and x10, x9, #0xfffffffffffff000
+; CHECK-CVT-NEXT:    csel x10, x10, x9, ne
+; CHECK-CVT-NEXT:    scvtf d0, x10
+; CHECK-CVT-NEXT:    cset w10, ne
+; CHECK-CVT-NEXT:    tst x9, #0xfff
+; CHECK-CVT-NEXT:    csel w10, wzr, w10, eq
+; CHECK-CVT-NEXT:    fmov x9, d0
+; CHECK-CVT-NEXT:    orr x9, x9, x11
+; CHECK-CVT-NEXT:    orr x9, x9, x10
+; CHECK-CVT-NEXT:    fmov d0, x9
+; CHECK-CVT-NEXT:    fcvtxn s0, d0
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_sitofp_i64:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    cmp x0, #0
+; CHECK-BF16-NEXT:    cneg x8, x0, mi
+; CHECK-BF16-NEXT:    lsr x9, x8, #53
+; CHECK-BF16-NEXT:    and x10, x8, #0xfffffffffffff000
+; CHECK-BF16-NEXT:    cmp x9, #0
+; CHECK-BF16-NEXT:    csel x9, x10, x8, ne
+; CHECK-BF16-NEXT:    and x10, x0, #0x8000000000000000
+; CHECK-BF16-NEXT:    cset w11, ne
+; CHECK-BF16-NEXT:    scvtf d0, x9
+; CHECK-BF16-NEXT:    tst x8, #0xfff
+; CHECK-BF16-NEXT:    fmov x9, d0
+; CHECK-BF16-NEXT:    orr x8, x9, x10
+; CHECK-BF16-NEXT:    csel w9, wzr, w11, eq
+; CHECK-BF16-NEXT:    orr x8, x8, x9
+; CHECK-BF16-NEXT:    fmov d0, x8
+; CHECK-BF16-NEXT:    fcvtxn s0, d0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %r = sitofp i64 %a to bfloat
+  ret bfloat %r
+}
+
+; uitofp i32 -> bfloat whose result feeds a bfloat fadd with %b. The converted
+; value is fully narrowed to bfloat and then widened to f32 again (lsl #16)
+; before the single-precision fadd; the sum is narrowed to bfloat once more.
+; Each narrowing is the integer add/shift sequence without +bf16 and a bfcvt
+; with +bf16.
+define bfloat @test_uitofp_i32_fadd(i32 %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_uitofp_i32_fadd:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    ucvtf d1, w0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fcvtxn s1, d1
+; CHECK-CVT-NEXT:    fmov w9, s1
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w9, w9, w8
+; CHECK-CVT-NEXT:    add w9, w10, w9
+; CHECK-CVT-NEXT:    lsr w9, w9, #16
+; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    fmov w10, s1
+; CHECK-CVT-NEXT:    lsl w9, w9, #16
+; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    lsl w10, w10, #16
+; CHECK-CVT-NEXT:    fmov s1, w10
+; CHECK-CVT-NEXT:    fadd s0, s0, s1
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_uitofp_i32_fadd:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    ucvtf d1, w0
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fcvtxn s1, d1
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    bfcvt h1, s1
+; CHECK-BF16-NEXT:    fmov w9, s1
+; CHECK-BF16-NEXT:    lsl w9, w9, #16
+; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    fadd s0, s0, s1
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %c = uitofp i32 %a to bfloat
+  %r = fadd bfloat %b, %c
+  ret bfloat %r
+}
+
+define bfloat @test_sitofp_i32_fadd(i32 %a, bfloat %b) #0 { ; signed i32 -> bfloat conversion whose result feeds a bfloat fadd
+; CHECK-CVT-LABEL: test_sitofp_i32_fadd:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    scvtf d1, w0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fcvtxn s1, d1
+; CHECK-CVT-NEXT:    fmov w9, s1
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w9, w9, w8
+; CHECK-CVT-NEXT:    add w9, w10, w9
+; CHECK-CVT-NEXT:    lsr w9, w9, #16
+; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    fmov w10, s1
+; CHECK-CVT-NEXT:    lsl w9, w9, #16
+; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    lsl w10, w10, #16
+; CHECK-CVT-NEXT:    fmov s1, w10
+; CHECK-CVT-NEXT:    fadd s0, s0, s1
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_sitofp_i32_fadd:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    scvtf d1, w0
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fcvtxn s1, d1
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    bfcvt h1, s1
+; CHECK-BF16-NEXT:    fmov w9, s1
+; CHECK-BF16-NEXT:    lsl w9, w9, #16
+; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    fadd s0, s0, s1
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %c = sitofp i32 %a to bfloat ; expected lowering converts to double (scvtf d1, w0) and narrows with fcvtxn before producing bf16
+  %r = fadd bfloat %b, %c ; both operands are widened with lsl #16 and added as f32 (fadd s0, s0, s1), then narrowed again
+  ret bfloat %r
+}
+
+define bfloat @test_fptrunc_float(float %a) #0 { ; f32 -> bfloat truncation
+; CHECK-CVT-LABEL: test_fptrunc_float:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    fcmp s0, s0
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    orr w9, w9, #0x400000
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    csel w8, w9, w8, vs
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_fptrunc_float:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %r = fptrunc float %a to bfloat ; one bfcvt with +bf16; otherwise integer rounding (add 0x7fff plus bit 16, lsr #16), with bit 22 set instead when fcmp s0, s0 is unordered
+  ret bfloat %r
+}
+
+define bfloat @test_fptrunc_double(double %a) #0 { ; f64 -> bfloat truncation
+; CHECK-CVT-LABEL: test_fptrunc_double:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    fcvtxn s0, d0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_fptrunc_double:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    fcvtxn s0, d0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %r = fptrunc double %a to bfloat ; both configurations first narrow to f32 with fcvtxn, then convert that f32 to bf16
+  ret bfloat %r
+}
+
+define float @test_fpext_float(bfloat %a) #0 { ; bfloat -> f32 extension; one shared set of assertions covers both RUN lines
+; CHECK-LABEL: test_fpext_float:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ret
+  %r = fpext bfloat %a to float ; expected code moves the bits to a GPR, shifts left by 16 and moves them back
+  ret float %r
+}
+
+define double @test_fpext_double(bfloat %a) #0 { ; bfloat -> f64 extension; one shared set of assertions covers both RUN lines
+; CHECK-LABEL: test_fpext_double:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fcvt d0, s0
+; CHECK-NEXT:    ret
+  %r = fpext bfloat %a to double ; widened to f32 via lsl #16 first, then fcvt d0, s0
+  ret double %r
+}
+
+define i16 @test_bitcast_bfloattoi16(bfloat %a) #0 { ; bfloat -> i16 bitcast
+; CHECK-LABEL: test_bitcast_bfloattoi16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %r = bitcast bfloat %a to i16 ; expected to be a single register move (fmov w0, s0)
+  ret i16 %r
+}
+
+define bfloat @test_bitcast_i16tobfloat(i16 %a) #0 { ; i16 -> bfloat bitcast
+; CHECK-LABEL: test_bitcast_i16tobfloat:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT:    ret
+  %r = bitcast i16 %a to bfloat ; expected to be a single register move (fmov s0, w0)
+  ret bfloat %r
+}
+
+declare bfloat @llvm.sqrt.f16(bfloat %a) #0 ; Declarations of the math intrinsics called by the intrinsic tests in this file; every one returns bfloat.
+declare bfloat @llvm.powi.f16.i32(bfloat %a, i32 %b) #0 ; NOTE(review): the .f16 suffix on these names does not match the bfloat operand type (canonical mangling would be .bf16); it looks carried over from the fp16 tests this file was ported from - presumably the IR parser remangles, confirm before renaming
+declare bfloat @llvm.sin.f16(bfloat %a) #0
+declare bfloat @llvm.cos.f16(bfloat %a) #0
+declare bfloat @llvm.tan.f16(bfloat %a) #0
+declare bfloat @llvm.asin.f16(bfloat %a) #0
+declare bfloat @llvm.acos.f16(bfloat %a) #0
+declare bfloat @llvm.atan.f16(bfloat %a) #0
+declare bfloat @llvm.atan2.f16(bfloat %a, bfloat %b) #0
+declare bfloat @llvm.sinh.f16(bfloat %a) #0
+declare bfloat @llvm.cosh.f16(bfloat %a) #0
+declare bfloat @llvm.tanh.f16(bfloat %a) #0
+declare bfloat @llvm.pow.f16(bfloat %a, bfloat %b) #0
+declare bfloat @llvm.exp.f16(bfloat %a) #0
+declare bfloat @llvm.exp2.f16(bfloat %a) #0
+declare bfloat @llvm.log.f16(bfloat %a) #0
+declare bfloat @llvm.log10.f16(bfloat %a) #0
+declare bfloat @llvm.log2.f16(bfloat %a) #0
+declare bfloat @llvm.fma.f16(bfloat %a, bfloat %b, bfloat %c) #0
+declare bfloat @llvm.fabs.f16(bfloat %a) #0
+declare bfloat @llvm.minnum.f16(bfloat %a, bfloat %b) #0
+declare bfloat @llvm.maxnum.f16(bfloat %a, bfloat %b) #0
+declare bfloat @llvm.copysign.f16(bfloat %a, bfloat %b) #0
+declare bfloat @llvm.floor.f16(bfloat %a) #0
+declare bfloat @llvm.ceil.f16(bfloat %a) #0
+declare bfloat @llvm.trunc.f16(bfloat %a) #0
+declare bfloat @llvm.rint.f16(bfloat %a) #0
+declare bfloat @llvm.nearbyint.f16(bfloat %a) #0
+declare bfloat @llvm.round.f16(bfloat %a) #0
+declare bfloat @llvm.roundeven.f16(bfloat %a) #0
+declare bfloat @llvm.fmuladd.f16(bfloat %a, bfloat %b, bfloat %c) #0
+
+
+define bfloat @test_sqrt(bfloat %a) #0 { ; square-root intrinsic on a bfloat operand
+; CHECK-CVT-LABEL: test_sqrt:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    lsl w9, w9, #16
+; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    fsqrt s0, s0
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_sqrt:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    fsqrt s0, s0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.sqrt.f16(bfloat %a) ; expected sequence widens to f32 (lsl #16), runs fsqrt s0, s0 inline, then narrows the result back to bf16
+  ret bfloat %r
+}
+
+define bfloat @test_powi(bfloat %a, i32 %b) #0 { ; integer-power intrinsic on a bfloat base with an i32 exponent
+; CHECK-CVT-LABEL: test_powi:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    bl __powisf2
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_powi:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    bl __powisf2
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.powi.f16.i32(bfloat %a, i32 %b) ; expected sequence widens the base to f32 (lsl #16), calls __powisf2, then narrows the f32 result back to bf16
+  ret bfloat %r
+}
+
+
+define bfloat @test_sin(bfloat %a) #0 { ; sine intrinsic on a bfloat operand, expected to become a call to sinf
+; CHECK-CVT-LABEL: test_sin:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    bl sinf
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_sin:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    bl sinf
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.sin.f16(bfloat %a) ; expected sequence widens to f32 (lsl #16), does bl sinf, then narrows the f32 result back to bf16
+  ret bfloat %r
+}
+
+define bfloat @test_cos(bfloat %a) #0 { ; cosine intrinsic on a bfloat operand, expected to become a call to cosf
+; CHECK-CVT-LABEL: test_cos:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    bl cosf
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_cos:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    bl cosf
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.cos.f16(bfloat %a) ; expected sequence widens to f32 (lsl #16), does bl cosf, then narrows the f32 result back to bf16
+  ret bfloat %r
+}
+
+define bfloat @test_tan(bfloat %a) #0 { ; tangent intrinsic on a bfloat operand, expected to become a call to tanf
+; CHECK-CVT-LABEL: test_tan:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    bl tanf
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_tan:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    bl tanf
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.tan.f16(bfloat %a) ; expected sequence widens to f32 (lsl #16), does bl tanf, then narrows the f32 result back to bf16
+  ret bfloat %r
+}
+
+define bfloat @test_acos(bfloat %a) #0 { ; arc-cosine intrinsic on a bfloat operand, expected to become a call to acosf
+; CHECK-CVT-LABEL: test_acos:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    bl acosf
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_acos:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    bl acosf
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.acos.f16(bfloat %a) ; expected sequence widens to f32 (lsl #16), does bl acosf, then narrows the f32 result back to bf16
+  ret bfloat %r
+}
+
+define bfloat @test_asin(bfloat %a) #0 { ; arc-sine intrinsic on a bfloat operand, expected to become a call to asinf
+; CHECK-CVT-LABEL: test_asin:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    bl asinf
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_asin:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    bl asinf
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.asin.f16(bfloat %a) ; expected sequence widens to f32 (lsl #16), does bl asinf, then narrows the f32 result back to bf16
+  ret bfloat %r
+}
+
+define bfloat @test_atan(bfloat %a) #0 { ; arc-tangent intrinsic on a bfloat operand, expected to become a call to atanf
+; CHECK-CVT-LABEL: test_atan:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    bl atanf
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_atan:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    bl atanf
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.atan.f16(bfloat %a) ; expected sequence widens to f32 (lsl #16), does bl atanf, then narrows the f32 result back to bf16
+  ret bfloat %r
+}
+
+define bfloat @test_atan2(bfloat %a, bfloat %b) #0 { ; two-operand arc-tangent intrinsic on bfloat, expected to become a call to atan2f
+; CHECK-CVT-LABEL: test_atan2:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    fmov w9, s1
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    lsl w9, w9, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    bl atan2f
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_atan2:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    fmov w9, s1
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    lsl w9, w9, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    bl atan2f
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.atan2.f16(bfloat %a, bfloat %b) ; expected sequence widens both operands to f32 (lsl #16), does bl atan2f, then narrows the f32 result back to bf16
+  ret bfloat %r
+}
+
+define bfloat @test_cosh(bfloat %a) #0 { ; hyperbolic-cosine intrinsic on a bfloat operand, expected to become a call to coshf
+; CHECK-CVT-LABEL: test_cosh:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    bl coshf
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_cosh:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    bl coshf
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.cosh.f16(bfloat %a) ; expected sequence widens to f32 (lsl #16), does bl coshf, then narrows the f32 result back to bf16
+  ret bfloat %r
+}
+
+define bfloat @test_sinh(bfloat %a) #0 { ; hyperbolic-sine intrinsic on a bfloat operand, expected to become a call to sinhf
+; CHECK-CVT-LABEL: test_sinh:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    bl sinhf
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_sinh:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    bl sinhf
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.sinh.f16(bfloat %a) ; expected sequence widens to f32 (lsl #16), does bl sinhf, then narrows the f32 result back to bf16
+  ret bfloat %r
+}
+
+define bfloat @test_tanh(bfloat %a) #0 { ; hyperbolic-tangent intrinsic on a bfloat operand, expected to become a call to tanhf
+; CHECK-CVT-LABEL: test_tanh:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    bl tanhf
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_tanh:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    bl tanhf
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.tanh.f16(bfloat %a) ; expected sequence widens to f32 (lsl #16), does bl tanhf, then narrows the f32 result back to bf16
+  ret bfloat %r
+}
+
+define bfloat @test_pow(bfloat %a, bfloat %b) #0 { ; power intrinsic on two bfloat operands, expected to become a call to powf
+; CHECK-CVT-LABEL: test_pow:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    fmov w9, s1
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    lsl w9, w9, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    bl powf
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_pow:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    fmov w9, s1
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    lsl w9, w9, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    bl powf
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.pow.f16(bfloat %a, bfloat %b) ; expected sequence widens both operands to f32 (lsl #16), does bl powf, then narrows the f32 result back to bf16
+  ret bfloat %r
+}
+
+define bfloat @test_exp(bfloat %a) #0 { ; exponential intrinsic on a bfloat operand, expected to become a call to expf
+; CHECK-CVT-LABEL: test_exp:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    bl expf
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_exp:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    bl expf
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.exp.f16(bfloat %a) ; expected sequence widens to f32 (lsl #16), does bl expf, then narrows the f32 result back to bf16
+  ret bfloat %r
+}
+
+define bfloat @test_exp2(bfloat %a) #0 { ; base-2 exponential intrinsic on a bfloat operand, expected to become a call to exp2f
+; CHECK-CVT-LABEL: test_exp2:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    bl exp2f
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_exp2:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    bl exp2f
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.exp2.f16(bfloat %a) ; expected sequence widens to f32 (lsl #16), does bl exp2f, then narrows the f32 result back to bf16
+  ret bfloat %r
+}
+
+define bfloat @test_log(bfloat %a) #0 { ; logarithm intrinsic on a bfloat operand, expected to become a call to logf
+; CHECK-CVT-LABEL: test_log:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    bl logf
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_log:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    bl logf
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.log.f16(bfloat %a) ; expected sequence widens to f32 (lsl #16), does bl logf, then narrows the f32 result back to bf16
+  ret bfloat %r
+}
+
+define bfloat @test_log10(bfloat %a) #0 { ; base-10 logarithm intrinsic on a bfloat operand, expected to become a call to log10f
+; CHECK-CVT-LABEL: test_log10:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    bl log10f
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_log10:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    bl log10f
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.log10.f16(bfloat %a) ; expected sequence widens to f32 (lsl #16), does bl log10f, then narrows the f32 result back to bf16
+  ret bfloat %r
+}
+
+define bfloat @test_log2(bfloat %a) #0 { ; base-2 logarithm intrinsic on a bfloat operand, expected to become a call to log2f
+; CHECK-CVT-LABEL: test_log2:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    bl log2f
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_log2:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    bl log2f
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.log2.f16(bfloat %a) ; expected sequence widens to f32 (lsl #16), does bl log2f, then narrows the f32 result back to bf16
+  ret bfloat %r
+}
+
+define bfloat @test_fma(bfloat %a, bfloat %b, bfloat %c) #0 { ; fused multiply-add intrinsic on three bfloat operands, expected to stay inline (no call)
+; CHECK-CVT-LABEL: test_fma:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    // kill: def $h2 killed $h2 def $s2
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w8, s2
+; CHECK-CVT-NEXT:    fmov w9, s1
+; CHECK-CVT-NEXT:    fmov w10, s0
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    lsl w9, w9, #16
+; CHECK-CVT-NEXT:    lsl w10, w10, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    fmov s2, w10
+; CHECK-CVT-NEXT:    mov w10, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    fmadd s0, s2, s1, s0
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    ubfx w9, w8, #16, #1
+; CHECK-CVT-NEXT:    add w8, w8, w10
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_fma:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h2 killed $h2 def $s2
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s2
+; CHECK-BF16-NEXT:    fmov w9, s1
+; CHECK-BF16-NEXT:    fmov w10, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    lsl w9, w9, #16
+; CHECK-BF16-NEXT:    lsl w10, w10, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    fmov s2, w10
+; CHECK-BF16-NEXT:    fmadd s0, s2, s1, s0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.fma.f16(bfloat %a, bfloat %b, bfloat %c) ; expected sequence widens all three operands to f32 (lsl #16), issues one fmadd s0, s2, s1, s0, then narrows the result back to bf16
+  ret bfloat %r
+}
+
+define bfloat @test_fabs(bfloat %a) #0 {
+; CHECK-LABEL: test_fabs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    and w8, w8, #0x7fff
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT:    ret
+  %r = call bfloat @llvm.fabs.bf16(bfloat %a) ; bfloat fabs: sign bit cleared with an integer and of 0x7fff; same output with and without +bf16
+  ret bfloat %r
+}
+
+define bfloat @test_minnum(bfloat %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_minnum:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w9, s1
+; CHECK-CVT-NEXT:    fmov w10, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    lsl w9, w9, #16
+; CHECK-CVT-NEXT:    lsl w10, w10, #16
+; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    fmov s1, w10
+; CHECK-CVT-NEXT:    fminnm s0, s1, s0
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_minnum:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s1
+; CHECK-BF16-NEXT:    fmov w9, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    lsl w9, w9, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    fminnm s0, s1, s0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.minnum.bf16(bfloat %a, bfloat %b) ; bfloat minnum: both operands widened to f32, fminnm, then rounded back to bfloat
+  ret bfloat %r
+}
+
+define bfloat @test_maxnum(bfloat %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_maxnum:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w9, s1
+; CHECK-CVT-NEXT:    fmov w10, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    lsl w9, w9, #16
+; CHECK-CVT-NEXT:    lsl w10, w10, #16
+; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    fmov s1, w10
+; CHECK-CVT-NEXT:    fmaxnm s0, s1, s0
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_maxnum:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s1
+; CHECK-BF16-NEXT:    fmov w9, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    lsl w9, w9, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    fmaxnm s0, s1, s0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b) ; bfloat maxnum: both operands widened to f32, fmaxnm, then rounded back to bfloat
+  ret bfloat %r
+}
+
+define bfloat @test_copysign(bfloat %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_copysign:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w8, s1
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mvni v2.4s, #128, lsl #24
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    lsl w9, w9, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_copysign:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s1
+; CHECK-BF16-NEXT:    fmov w9, s0
+; CHECK-BF16-NEXT:    mvni v2.4s, #128, lsl #24
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    lsl w9, w9, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.copysign.bf16(bfloat %a, bfloat %b) ; bfloat copysign: both operands widened to f32 and merged with a bit select under the mvni-built mask
+  ret bfloat %r
+}
+
+define bfloat @test_copysign_f32(bfloat %a, float %b) #0 {
+; CHECK-CVT-LABEL: test_copysign_f32:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    mvni v2.4s, #128, lsl #24
+; CHECK-CVT-NEXT:    // kill: def $s1 killed $s1 def $q1
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_copysign_f32:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    mvni v2.4s, #128, lsl #24
+; CHECK-BF16-NEXT:    // kill: def $s1 killed $s1 def $q1
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %tb = fptrunc float %b to bfloat ; expected output feeds %b's f32 register straight into the bif; no rounding sequence is emitted for this fptrunc
+  %r = call bfloat @llvm.copysign.bf16(bfloat %a, bfloat %tb) ; copysign of a bfloat magnitude with a sign source that started as f32
+  ret bfloat %r
+}
+
+define bfloat @test_copysign_f64(bfloat %a, double %b) #0 {
+; CHECK-CVT-LABEL: test_copysign_f64:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    fcvt s1, d1
+; CHECK-CVT-NEXT:    mvni v2.4s, #128, lsl #24
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_copysign_f64:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    fcvt s1, d1
+; CHECK-BF16-NEXT:    mvni v2.4s, #128, lsl #24
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %tb = fptrunc double %b to bfloat ; expected output narrows %b with one fcvt to f32 and uses that directly in the bif
+  %r = call bfloat @llvm.copysign.bf16(bfloat %a, bfloat %tb) ; copysign of a bfloat magnitude with a sign source that started as f64
+  ret bfloat %r
+}
+
+; Check that FP promotion uses a truncating FP_ROUND, so we can fold away the (fpext (fp_round <result>)) here.
+
+define float @test_copysign_extended(bfloat %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_copysign_extended:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w8, s1
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mvni v2.4s, #128, lsl #24
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    lsl w9, w9, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_copysign_extended:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s1
+; CHECK-BF16-NEXT:    fmov w9, s0
+; CHECK-BF16-NEXT:    mvni v2.4s, #128, lsl #24
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    lsl w9, w9, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.copysign.bf16(bfloat %a, bfloat %b) ; copysign carried out on the f32-widened operands
+  %xr = fpext bfloat %r to float ; with +bf16 the f32 result of the bit select is returned as-is; without it only an lsr/lsl #16 pair remains
+  ret float %xr
+}
+
+define bfloat @test_floor(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_floor:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    lsl w9, w9, #16
+; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    frintm s0, s0
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_floor:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    frintm s0, s0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.floor.bf16(bfloat %a) ; bfloat floor: widened to f32, frintm, then rounded back to bfloat
+  ret bfloat %r
+}
+
+define bfloat @test_ceil(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_ceil:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    lsl w9, w9, #16
+; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    frintp s0, s0
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_ceil:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    frintp s0, s0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.ceil.bf16(bfloat %a) ; bfloat ceil: widened to f32, frintp, then rounded back to bfloat
+  ret bfloat %r
+}
+
+define bfloat @test_trunc(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_trunc:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    lsl w9, w9, #16
+; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    frintz s0, s0
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_trunc:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    frintz s0, s0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.trunc.bf16(bfloat %a) ; bfloat trunc: widened to f32, frintz, then rounded back to bfloat
+  ret bfloat %r
+}
+
+define bfloat @test_rint(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_rint:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    lsl w9, w9, #16
+; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    frintx s0, s0
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_rint:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    frintx s0, s0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.rint.bf16(bfloat %a) ; bfloat rint: widened to f32, frintx, then rounded back to bfloat
+  ret bfloat %r
+}
+
+define bfloat @test_nearbyint(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_nearbyint:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    lsl w9, w9, #16
+; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    frinti s0, s0
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_nearbyint:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    frinti s0, s0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.nearbyint.bf16(bfloat %a) ; bfloat nearbyint: widened to f32, frinti, then rounded back to bfloat
+  ret bfloat %r
+}
+
+define bfloat @test_round(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_round:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    lsl w9, w9, #16
+; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    frinta s0, s0
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_round:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    frinta s0, s0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.round.bf16(bfloat %a) ; bfloat round: widened to f32, frinta, then rounded back to bfloat
+  ret bfloat %r
+}
+
+define bfloat @test_roundeven(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_roundeven:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    lsl w9, w9, #16
+; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    frintn s0, s0
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_roundeven:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    frintn s0, s0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.roundeven.bf16(bfloat %a) ; bfloat roundeven: widened to f32, frintn, then rounded back to bfloat
+  ret bfloat %r
+}
+
+define bfloat @test_fmuladd(bfloat %a, bfloat %b, bfloat %c) #0 {
+; CHECK-CVT-LABEL: test_fmuladd:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    fmov w8, s1
+; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    mov w10, #32767 // =0x7fff
+; CHECK-CVT-NEXT:    // kill: def $h2 killed $h2 def $s2
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    lsl w9, w9, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    fmul s0, s1, s0
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    ubfx w9, w8, #16, #1
+; CHECK-CVT-NEXT:    add w8, w8, w10
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    fmov w9, s2
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    lsl w8, w8, #16
+; CHECK-CVT-NEXT:    lsl w9, w9, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    fadd s0, s0, s1
+; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    ubfx w9, w8, #16, #1
+; CHECK-CVT-NEXT:    add w8, w8, w10
+; CHECK-CVT-NEXT:    add w8, w9, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: test_fmuladd:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT:    fmov w8, s1
+; CHECK-BF16-NEXT:    fmov w9, s0
+; CHECK-BF16-NEXT:    // kill: def $h2 killed $h2 def $s2
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    lsl w9, w9, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    fmov w9, s2
+; CHECK-BF16-NEXT:    fmul s0, s1, s0
+; CHECK-BF16-NEXT:    lsl w9, w9, #16
+; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    fadd s0, s0, s1
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
+  %r = call bfloat @llvm.fmuladd.bf16(bfloat %a, bfloat %b, bfloat %c) ; not fused in the expected output: fmul, round to bfloat, re-widen, fadd, round again
+  ret bfloat %r
+}
+
+attributes #0 = { nounwind }

diff  --git a/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll
new file mode 100644
index 00000000000000..9b6e19eba3f4e6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll
@@ -0,0 +1,711 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64 -mattr=-bf16 | FileCheck %s --check-prefixes=CHECK,CHECK-CVT
+; RUN: llc < %s -mtriple=aarch64 -mattr=+bf16 | FileCheck %s --check-prefixes=CHECK,CHECK-BF16
+
+define <4 x bfloat> @add_h(<4 x bfloat> %a, <4 x bfloat> %b) {
+; CHECK-CVT-LABEL: add_h:
+; CHECK-CVT:       // %bb.0: // %entry
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT:    movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT:    addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: add_h:
+; CHECK-BF16:       // %bb.0: // %entry
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT:    ret
+entry:
+
+  %0 = fadd <4 x bfloat> %a, %b ; vector bfloat add: both inputs widened with shll #16, added as <4 x float>, then narrowed (bfcvtn with +bf16, integer rounding + addhn without)
+  ret <4 x bfloat> %0
+}
+
+
+define <4 x bfloat> @build_h4(<4 x bfloat> %a) {
+; CHECK-LABEL: build_h4:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #15565 // =0x3ccd
+; CHECK-NEXT:    dup v0.4h, w8
+; CHECK-NEXT:    ret
+entry:
+  ret <4 x bfloat> <bfloat 0xR3CCD, bfloat 0xR3CCD, bfloat 0xR3CCD, bfloat 0xR3CCD> ; constant splat of the 16-bit pattern 0x3CCD, materialised as mov + dup; %a is unused
+}
+
+
+define <4 x bfloat> @sub_h(<4 x bfloat> %a, <4 x bfloat> %b) {
+; CHECK-CVT-LABEL: sub_h:
+; CHECK-CVT:       // %bb.0: // %entry
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fsub v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT:    movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT:    addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: sub_h:
+; CHECK-BF16:       // %bb.0: // %entry
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fsub v0.4s, v0.4s, v1.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT:    ret
+entry:
+
+  %0 = fsub <4 x bfloat> %a, %b ; vector bfloat subtract: both inputs widened with shll #16, subtracted as <4 x float>, then narrowed back
+  ret <4 x bfloat> %0
+}
+
+
+define <4 x bfloat> @mul_h(<4 x bfloat> %a, <4 x bfloat> %b) {
+; CHECK-CVT-LABEL: mul_h:
+; CHECK-CVT:       // %bb.0: // %entry
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fmul v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT:    movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT:    addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: mul_h:
+; CHECK-BF16:       // %bb.0: // %entry
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fmul v0.4s, v0.4s, v1.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT:    ret
+entry:
+
+  %0 = fmul <4 x bfloat> %a, %b ; vector bfloat multiply: both inputs widened with shll #16, multiplied as <4 x float>, then narrowed back
+  ret <4 x bfloat> %0
+}
+
+
+define <4 x bfloat> @div_h(<4 x bfloat> %a, <4 x bfloat> %b) {
+; CHECK-CVT-LABEL: div_h:
+; CHECK-CVT:       // %bb.0: // %entry
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fdiv v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT:    movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT:    addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: div_h:
+; CHECK-BF16:       // %bb.0: // %entry
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fdiv v0.4s, v0.4s, v1.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT:    ret
+entry:
+
+  %0 = fdiv <4 x bfloat> %a, %b ; vector bfloat divide: both inputs widened with shll #16, divided as <4 x float>, then narrowed back
+  ret <4 x bfloat> %0
+}
+
+
+define <4 x bfloat> @load_h(ptr %a) {
+; CHECK-LABEL: load_h:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %0 = load <4 x bfloat>, ptr %a, align 4 ; whole <4 x bfloat> vector loaded with a single ldr of d0
+  ret <4 x bfloat> %0
+}
+
+
+define void @store_h(ptr %a, <4 x bfloat> %b) {
+; CHECK-LABEL: store_h:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  store <4 x bfloat> %b, ptr %a, align 4 ; whole <4 x bfloat> vector stored with a single str of d0
+  ret void
+}
+
+define <4 x bfloat> @s_to_h(<4 x float> %a) {
+; CHECK-CVT-LABEL: s_to_h:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT:    ushr v3.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    and v1.16b, v3.16b, v1.16b
+; CHECK-CVT-NEXT:    add v2.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT:    fcmeq v3.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT:    orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    add v1.4s, v1.4s, v2.4s
+; CHECK-CVT-NEXT:    bit v0.16b, v1.16b, v3.16b
+; CHECK-CVT-NEXT:    shrn v0.4h, v0.4s, #16
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: s_to_h:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT:    ret
+  %1 = fptrunc <4 x float> %a to <4 x bfloat> ; f32 -> bfloat narrowing: one bfcvtn with +bf16; otherwise an integer rounding sequence with a self-compare (fcmeq) select, ending in shrn #16
+  ret <4 x bfloat> %1
+}
+
+define <4 x bfloat> @d_to_h(<4 x double> %a) {
+; CHECK-CVT-LABEL: d_to_h:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    fcvtxn v0.2s, v0.2d
+; CHECK-CVT-NEXT:    movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT:    fcvtxn2 v0.4s, v1.2d
+; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    ushr v3.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    add v2.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT:    and v1.16b, v3.16b, v1.16b
+; CHECK-CVT-NEXT:    fcmeq v3.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT:    orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    add v1.4s, v1.4s, v2.4s
+; CHECK-CVT-NEXT:    bit v0.16b, v1.16b, v3.16b
+; CHECK-CVT-NEXT:    shrn v0.4h, v0.4s, #16
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: d_to_h:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    fcvtxn v0.2s, v0.2d
+; CHECK-BF16-NEXT:    fcvtxn2 v0.4s, v1.2d
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT:    ret
+  %1 = fptrunc <4 x double> %a to <4 x bfloat> ; f64 -> bfloat narrowing in two steps: fcvtxn/fcvtxn2 down to f32 first, then the f32 -> bfloat narrowing
+  ret <4 x bfloat> %1
+}
+
+define <4 x float> @h_to_s(<4 x bfloat> %a) {
+; CHECK-LABEL: h_to_s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    ret
+  %1 = fpext <4 x bfloat> %a to <4 x float> ; bfloat -> f32 extension is a single shll #16; same output with and without +bf16
+  ret <4 x float> %1
+}
+
+define <4 x double> @h_to_d(<4 x bfloat> %a) {
+; CHECK-LABEL: h_to_d:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcvtl2 v1.2d, v0.4s
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    ret
+  %1 = fpext <4 x bfloat> %a to <4 x double> ; bfloat -> f64 extension: shll #16 to f32, then fcvtl/fcvtl2 to the two f64 halves
+  ret <4 x double> %1
+}
+
+define <4 x bfloat> @bitcast_i_to_h(float, <4 x i16> %a) {
+; CHECK-LABEL: bitcast_i_to_h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d0, d1
+; CHECK-NEXT:    ret
+  %2 = bitcast <4 x i16> %a to <4 x bfloat> ; pure bitcast: only a d1 -> d0 register move is expected (the unnamed float parameter presumably occupies the first FP register)
+  ret <4 x bfloat> %2
+}
+
+define <4 x i16> @bitcast_h_to_i(float, <4 x bfloat> %a) {
+; CHECK-LABEL: bitcast_h_to_i:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d0, d1
+; CHECK-NEXT:    ret
+  %2 = bitcast <4 x bfloat> %a to <4 x i16> ; pure bitcast: only a d1 -> d0 register move is expected (the unnamed float parameter presumably occupies the first FP register)
+  ret <4 x i16> %2
+}
+
+define <4 x bfloat> @sitofp_i8(<4 x i8> %a) #0 {
+; CHECK-CVT-LABEL: sitofp_i8:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    shl v0.4h, v0.4h, #8
+; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    sshr v0.4h, v0.4h, #8
+; CHECK-CVT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-CVT-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT:    ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT:    movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT:    addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: sitofp_i8:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    shl v0.4h, v0.4h, #8
+; CHECK-BF16-NEXT:    sshr v0.4h, v0.4h, #8
+; CHECK-BF16-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT:    ret
+  %1 = sitofp <4 x i8> %a to <4 x bfloat> ; signed i8 -> bfloat: sign-extended in-register (shl/sshr #8, sshll), scvtf to f32, then narrowed to bfloat
+  ret <4 x bfloat> %1
+}
+
+define <4 x bfloat> @sitofp_i16(<4 x i16> %a) #0 {
+; CHECK-CVT-LABEL: sitofp_i16:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT:    ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT:    movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT:    addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: sitofp_i16:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT:    ret
+  %1 = sitofp <4 x i16> %a to <4 x bfloat> ; signed i16 -> bfloat: sshll to 32-bit lanes, scvtf to f32, then narrowed to bfloat
+  ret <4 x bfloat> %1
+}
+
+
+define <4 x bfloat> @sitofp_i32(<4 x i32> %a) #0 {
+; CHECK-CVT-LABEL: sitofp_i32:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT:    movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT:    addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: sitofp_i32:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT:    ret
+  %1 = sitofp <4 x i32> %a to <4 x bfloat> ; signed i32 -> bfloat: scvtf straight to f32, then narrowed to bfloat
+  ret <4 x bfloat> %1
+}
+
+
+define <4 x bfloat> @sitofp_i64(<4 x i64> %a) #0 {
+; CHECK-CVT-LABEL: sitofp_i64:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-CVT-NEXT:    scvtf v1.2d, v1.2d
+; CHECK-CVT-NEXT:    movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-CVT-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    ushr v3.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    add v2.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT:    and v1.16b, v3.16b, v1.16b
+; CHECK-CVT-NEXT:    fcmeq v3.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT:    orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    add v1.4s, v1.4s, v2.4s
+; CHECK-CVT-NEXT:    bit v0.16b, v1.16b, v3.16b
+; CHECK-CVT-NEXT:    shrn v0.4h, v0.4s, #16
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: sitofp_i64:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-BF16-NEXT:    scvtf v1.2d, v1.2d
+; CHECK-BF16-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-BF16-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT:    ret
+  %1 = sitofp <4 x i64> %a to <4 x bfloat> ; signed i64 -> bfloat: scvtf on both 2 x i64 halves to f64, fcvtn/fcvtn2 down to f32, then narrowed to bfloat
+  ret <4 x bfloat> %1
+}
+
+define <4 x bfloat> @uitofp_i8(<4 x i8> %a) #0 {
+; CHECK-CVT-LABEL: uitofp_i8:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-CVT-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT:    ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT:    movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT:    addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: uitofp_i8:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-BF16-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT:    ret
+  %1 = uitofp <4 x i8> %a to <4 x bfloat> ; unsigned i8 -> bfloat: high byte of each 16-bit lane cleared with bic, ushll, ucvtf to f32, then narrowed to bfloat
+  ret <4 x bfloat> %1
+}
+
+
+define <4 x bfloat> @uitofp_i16(<4 x i16> %a) #0 {
+; CHECK-CVT-LABEL: uitofp_i16:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT:    ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT:    movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT:    addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: uitofp_i16:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT:    ret
+  %cvt = uitofp <4 x i16> %a to <4 x bfloat> ; lanes are zero-extended to 32 bits, converted to f32, then narrowed
+  ret <4 x bfloat> %cvt
+}
+
+
+define <4 x bfloat> @uitofp_i32(<4 x i32> %a) #0 {
+; CHECK-CVT-LABEL: uitofp_i32:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT:    movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT:    addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: uitofp_i32:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT:    ret
+  %cvt = uitofp <4 x i32> %a to <4 x bfloat> ; converted to f32 in place, then narrowed to bf16
+  ret <4 x bfloat> %cvt
+}
+
+
+define <4 x bfloat> @uitofp_i64(<4 x i64> %a) #0 {
+; CHECK-CVT-LABEL: uitofp_i64:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-CVT-NEXT:    ucvtf v1.2d, v1.2d
+; CHECK-CVT-NEXT:    movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-CVT-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    ushr v3.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    add v2.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT:    and v1.16b, v3.16b, v1.16b
+; CHECK-CVT-NEXT:    fcmeq v3.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT:    orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    add v1.4s, v1.4s, v2.4s
+; CHECK-CVT-NEXT:    bit v0.16b, v1.16b, v3.16b
+; CHECK-CVT-NEXT:    shrn v0.4h, v0.4s, #16
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: uitofp_i64:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-BF16-NEXT:    ucvtf v1.2d, v1.2d
+; CHECK-BF16-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-BF16-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT:    ret
+  %cvt = uitofp <4 x i64> %a to <4 x bfloat> ; unsigned i64 lanes pass through f64 and f32 before narrowing to bf16
+  ret <4 x bfloat> %cvt
+}
+
+define void @test_insert_at_zero(bfloat %a, ptr %b) #0 {
+; CHECK-LABEL: test_insert_at_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    ret
+  %1 = insertelement <4 x bfloat> poison, bfloat %a, i64 0 ; only lane 0 is defined; poison replaces the discouraged undef base
+  store <4 x bfloat> %1, ptr %b, align 4
+  ret void
+}
+
+define <4 x i8> @fptosi_i8(<4 x bfloat> %a) #0 {
+; CHECK-LABEL: fptosi_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %1 = fptosi <4 x bfloat> %a to <4 x i8> ; bf16 is widened to f32 (shll #16), converted, then narrowed to 16-bit lanes
+  ret <4 x i8> %1
+}
+
+define <4 x i16> @fptosi_i16(<4 x bfloat> %a) #0 {
+; CHECK-LABEL: fptosi_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %1 = fptosi <4 x bfloat> %a to <4 x i16> ; bf16 is widened to f32 (shll #16), converted, then narrowed to 16-bit lanes
+  ret <4 x i16> %1
+}
+
+define <4 x i8> @fptoui_i8(<4 x bfloat> %a) #0 {
+; CHECK-LABEL: fptoui_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+; NOTE: the signed fcvtzs is selected for this unsigned conversion; the following xtn drops the upper bits of each 32-bit lane, sign bit included.
+  %1 = fptoui <4 x bfloat> %a to <4 x i8>
+  ret <4 x i8> %1
+}
+
+define <4 x i16> @fptoui_i16(<4 x bfloat> %a) #0 {
+; CHECK-LABEL: fptoui_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %1 = fptoui <4 x bfloat> %a to <4 x i16> ; bf16 is widened to f32 (shll #16), converted unsigned, then narrowed to 16-bit lanes
+  ret <4 x i16> %1
+}
+
+define <4 x i1> @test_fcmp_une(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_une:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmeq v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+; fcmp une (unordered or not equal) is lowered to the inverse of "a == b".
+  %cmp = fcmp une <4 x bfloat> %a, %b
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @test_fcmp_ueq(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ueq:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmgt v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NEXT:    ret
+; fcmp ueq (unordered or equal) is lowered to the inverse of "a > b or b > a".
+  %cmp = fcmp ueq <4 x bfloat> %a, %b
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @test_fcmp_ugt(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ugt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    fcmge v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NEXT:    ret
+; fcmp ugt (unordered or greater than) is lowered to the inverse of "b >= a".
+  %cmp = fcmp ugt <4 x bfloat> %a, %b
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @test_fcmp_uge(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_uge:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NEXT:    ret
+; fcmp uge (unordered or greater than or equal) is lowered to the inverse of "b > a".
+  %cmp = fcmp uge <4 x bfloat> %a, %b
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @test_fcmp_ult(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ult:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmge v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NEXT:    ret
+; fcmp ult (unordered or less than) is lowered to the inverse of "a >= b".
+  %cmp = fcmp ult <4 x bfloat> %a, %b
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @test_fcmp_ule(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ule:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmgt v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NEXT:    ret
+; fcmp ule (unordered or less than or equal) is lowered to the inverse of "a > b".
+  %cmp = fcmp ule <4 x bfloat> %a, %b
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @test_fcmp_uno(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_uno:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmge v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NEXT:    ret
+; fcmp uno (either operand is a NaN) is lowered to the inverse of "a >= b or b > a".
+  %cmp = fcmp uno <4 x bfloat> %a, %b
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @test_fcmp_one(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_one:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmgt v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+; fcmp one (ordered and not equal) is lowered to "a > b or b > a".
+  %cmp = fcmp one <4 x bfloat> %a, %b
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @test_fcmp_oeq(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_oeq:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmeq v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+; fcmp oeq (ordered and equal) maps directly onto a single fcmeq.
+  %cmp = fcmp oeq <4 x bfloat> %a, %b
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @test_fcmp_ogt(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ogt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmgt v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+; fcmp ogt (ordered and greater than) maps directly onto fcmgt "a > b".
+  %cmp = fcmp ogt <4 x bfloat> %a, %b
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @test_fcmp_oge(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_oge:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmge v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+; fcmp oge (ordered and greater than or equal) maps directly onto fcmge "a >= b".
+  %cmp = fcmp oge <4 x bfloat> %a, %b
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @test_fcmp_olt(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_olt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+; fcmp olt (ordered and less than) is lowered to fcmgt with swapped operands, "b > a".
+  %cmp = fcmp olt <4 x bfloat> %a, %b
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @test_fcmp_ole(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ole:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    fcmge v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+; fcmp ole (ordered and less than or equal) is lowered to fcmge with swapped operands, "b >= a".
+  %cmp = fcmp ole <4 x bfloat> %a, %b
+  ret <4 x i1> %cmp
+}
+
+define <4 x i1> @test_fcmp_ord(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ord:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmge v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+; fcmp ord (neither operand is a NaN) is lowered to "a >= b or b > a".
+  %cmp = fcmp ord <4 x bfloat> %a, %b
+  ret <4 x i1> %cmp
+}
+
+attributes #0 = { nounwind }

diff  --git a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
new file mode 100644
index 00000000000000..c03e2e5321321a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
@@ -0,0 +1,2192 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64 -mattr=-bf16 | FileCheck %s --check-prefixes=CHECK,CHECK-CVT
+; RUN: llc < %s -mtriple=aarch64 -mattr=+bf16 | FileCheck %s --check-prefixes=CHECK,CHECK-BF16
+
+define <8 x bfloat> @add_h(<8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-CVT-LABEL: add_h:
+; CHECK-CVT:       // %bb.0: // %entry
+; CHECK-CVT-NEXT:    shll2 v3.4s, v1.8h, #16
+; CHECK-CVT-NEXT:    shll2 v4.4s, v0.8h, #16
+; CHECK-CVT-NEXT:    movi v2.4s, #1
+; CHECK-CVT-NEXT:    ushr v5.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fadd v3.4s, v4.4s, v3.4s
+; CHECK-CVT-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    and v2.16b, v5.16b, v2.16b
+; CHECK-CVT-NEXT:    movi v1.4s, #127, msl #8
+; CHECK-CVT-NEXT:    fcmeq v5.4s, v3.4s, v3.4s
+; CHECK-CVT-NEXT:    add v4.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT:    orr v3.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    add v2.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT:    fcmeq v6.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT:    orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    add v4.4s, v4.4s, v1.4s
+; CHECK-CVT-NEXT:    add v1.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT:    mov v2.16b, v5.16b
+; CHECK-CVT-NEXT:    bsl v2.16b, v4.16b, v3.16b
+; CHECK-CVT-NEXT:    bit v0.16b, v1.16b, v6.16b
+; CHECK-CVT-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: add_h:
+; CHECK-BF16:       // %bb.0: // %entry
+; CHECK-BF16-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v3.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    shll2 v1.4s, v1.8h, #16
+; CHECK-BF16-NEXT:    shll2 v0.4s, v0.8h, #16
+; CHECK-BF16-NEXT:    fadd v2.4s, v3.4s, v2.4s
+; CHECK-BF16-NEXT:    fadd v1.4s, v0.4s, v1.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v2.4s
+; CHECK-BF16-NEXT:    bfcvtn2 v0.8h, v1.4s
+; CHECK-BF16-NEXT:    ret
+entry:
+  %sum = fadd <8 x bfloat> %a, %b ; NOTE(review): the no-bf16 output takes its rounding bit (ushr/and #1) from raw %a before the fadd - confirm intended
+  ret <8 x bfloat> %sum
+}
+
+
+define <8 x bfloat> @sub_h(<8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-CVT-LABEL: sub_h:
+; CHECK-CVT:       // %bb.0: // %entry
+; CHECK-CVT-NEXT:    shll2 v3.4s, v1.8h, #16
+; CHECK-CVT-NEXT:    shll2 v4.4s, v0.8h, #16
+; CHECK-CVT-NEXT:    movi v2.4s, #1
+; CHECK-CVT-NEXT:    ushr v5.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fsub v3.4s, v4.4s, v3.4s
+; CHECK-CVT-NEXT:    fsub v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    and v2.16b, v5.16b, v2.16b
+; CHECK-CVT-NEXT:    movi v1.4s, #127, msl #8
+; CHECK-CVT-NEXT:    fcmeq v5.4s, v3.4s, v3.4s
+; CHECK-CVT-NEXT:    add v4.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT:    orr v3.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    add v2.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT:    fcmeq v6.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT:    orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    add v4.4s, v4.4s, v1.4s
+; CHECK-CVT-NEXT:    add v1.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT:    mov v2.16b, v5.16b
+; CHECK-CVT-NEXT:    bsl v2.16b, v4.16b, v3.16b
+; CHECK-CVT-NEXT:    bit v0.16b, v1.16b, v6.16b
+; CHECK-CVT-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: sub_h:
+; CHECK-BF16:       // %bb.0: // %entry
+; CHECK-BF16-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v3.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    shll2 v1.4s, v1.8h, #16
+; CHECK-BF16-NEXT:    shll2 v0.4s, v0.8h, #16
+; CHECK-BF16-NEXT:    fsub v2.4s, v3.4s, v2.4s
+; CHECK-BF16-NEXT:    fsub v1.4s, v0.4s, v1.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v2.4s
+; CHECK-BF16-NEXT:    bfcvtn2 v0.8h, v1.4s
+; CHECK-BF16-NEXT:    ret
+entry:
+  %diff = fsub <8 x bfloat> %a, %b ; NOTE(review): the no-bf16 output takes its rounding bit (ushr/and #1) from raw %a before the fsub - confirm intended
+  ret <8 x bfloat> %diff
+}
+
+
+define <8 x bfloat> @mul_h(<8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-CVT-LABEL: mul_h:
+; CHECK-CVT:       // %bb.0: // %entry
+; CHECK-CVT-NEXT:    shll2 v3.4s, v1.8h, #16
+; CHECK-CVT-NEXT:    shll2 v4.4s, v0.8h, #16
+; CHECK-CVT-NEXT:    movi v2.4s, #1
+; CHECK-CVT-NEXT:    ushr v5.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fmul v3.4s, v4.4s, v3.4s
+; CHECK-CVT-NEXT:    fmul v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    and v2.16b, v5.16b, v2.16b
+; CHECK-CVT-NEXT:    movi v1.4s, #127, msl #8
+; CHECK-CVT-NEXT:    fcmeq v5.4s, v3.4s, v3.4s
+; CHECK-CVT-NEXT:    add v4.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT:    orr v3.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    add v2.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT:    fcmeq v6.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT:    orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    add v4.4s, v4.4s, v1.4s
+; CHECK-CVT-NEXT:    add v1.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT:    mov v2.16b, v5.16b
+; CHECK-CVT-NEXT:    bsl v2.16b, v4.16b, v3.16b
+; CHECK-CVT-NEXT:    bit v0.16b, v1.16b, v6.16b
+; CHECK-CVT-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: mul_h:
+; CHECK-BF16:       // %bb.0: // %entry
+; CHECK-BF16-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v3.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    shll2 v1.4s, v1.8h, #16
+; CHECK-BF16-NEXT:    shll2 v0.4s, v0.8h, #16
+; CHECK-BF16-NEXT:    fmul v2.4s, v3.4s, v2.4s
+; CHECK-BF16-NEXT:    fmul v1.4s, v0.4s, v1.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v2.4s
+; CHECK-BF16-NEXT:    bfcvtn2 v0.8h, v1.4s
+; CHECK-BF16-NEXT:    ret
+entry:
+  %prod = fmul <8 x bfloat> %a, %b ; NOTE(review): the no-bf16 output takes its rounding bit (ushr/and #1) from raw %a before the fmul - confirm intended
+  ret <8 x bfloat> %prod
+}
+
+
+define <8 x bfloat> @div_h(<8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-CVT-LABEL: div_h:
+; CHECK-CVT:       // %bb.0: // %entry
+; CHECK-CVT-NEXT:    shll2 v2.4s, v1.8h, #16
+; CHECK-CVT-NEXT:    shll2 v3.4s, v0.8h, #16
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    movi v4.4s, #127, msl #8
+; CHECK-CVT-NEXT:    fdiv v2.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT:    shll v3.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    ushr v0.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    fdiv v1.4s, v3.4s, v1.4s
+; CHECK-CVT-NEXT:    movi v3.4s, #1
+; CHECK-CVT-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-CVT-NEXT:    add v0.4s, v0.4s, v4.4s
+; CHECK-CVT-NEXT:    fcmeq v4.4s, v2.4s, v2.4s
+; CHECK-CVT-NEXT:    add v3.4s, v2.4s, v0.4s
+; CHECK-CVT-NEXT:    orr v2.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    fcmeq v5.4s, v1.4s, v1.4s
+; CHECK-CVT-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT:    orr v1.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    bit v2.16b, v3.16b, v4.16b
+; CHECK-CVT-NEXT:    bif v0.16b, v1.16b, v5.16b
+; CHECK-CVT-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: div_h:
+; CHECK-BF16:       // %bb.0: // %entry
+; CHECK-BF16-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v3.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    shll2 v1.4s, v1.8h, #16
+; CHECK-BF16-NEXT:    shll2 v0.4s, v0.8h, #16
+; CHECK-BF16-NEXT:    fdiv v2.4s, v3.4s, v2.4s
+; CHECK-BF16-NEXT:    fdiv v1.4s, v0.4s, v1.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v2.4s
+; CHECK-BF16-NEXT:    bfcvtn2 v0.8h, v1.4s
+; CHECK-BF16-NEXT:    ret
+entry:
+  %quot = fdiv <8 x bfloat> %a, %b ; NOTE(review): the no-bf16 output takes its rounding bit (ushr/and #1) from raw %a, not the fdiv result - confirm intended
+  ret <8 x bfloat> %quot
+}
+
+
+define <8 x bfloat> @load_h(ptr %a) {
+; CHECK-LABEL: load_h:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %vec = load <8 x bfloat>, ptr %a, align 4 ; the whole vector is expected to load with one 128-bit ldr
+  ret <8 x bfloat> %vec
+}
+
+
+define void @store_h(ptr %a, <8 x bfloat> %b) {
+; CHECK-LABEL: store_h:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  store <8 x bfloat> %b, ptr %a, align 4 ; the whole vector is expected to store with one 128-bit str
+  ret void
+}
+
+define <8 x bfloat> @s_to_h(<8 x float> %a) {
+; CHECK-CVT-LABEL: s_to_h:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    movi v2.4s, #1
+; CHECK-CVT-NEXT:    movi v3.4s, #127, msl #8
+; CHECK-CVT-NEXT:    ushr v4.4s, v1.4s, #16
+; CHECK-CVT-NEXT:    ushr v5.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    and v4.16b, v4.16b, v2.16b
+; CHECK-CVT-NEXT:    add v6.4s, v1.4s, v3.4s
+; CHECK-CVT-NEXT:    and v2.16b, v5.16b, v2.16b
+; CHECK-CVT-NEXT:    add v3.4s, v0.4s, v3.4s
+; CHECK-CVT-NEXT:    fcmeq v5.4s, v1.4s, v1.4s
+; CHECK-CVT-NEXT:    orr v1.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    add v4.4s, v4.4s, v6.4s
+; CHECK-CVT-NEXT:    fcmeq v6.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT:    orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    add v2.4s, v2.4s, v3.4s
+; CHECK-CVT-NEXT:    bit v1.16b, v4.16b, v5.16b
+; CHECK-CVT-NEXT:    bit v0.16b, v2.16b, v6.16b
+; CHECK-CVT-NEXT:    uzp2 v0.8h, v0.8h, v1.8h
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: s_to_h:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT:    ret
+  %trunc = fptrunc <8 x float> %a to <8 x bfloat> ; each 4-lane half is narrowed separately, then the halves are combined
+  ret <8 x bfloat> %trunc
+}
+
+define <8 x bfloat> @d_to_h(<8 x double> %a) {
+; CHECK-CVT-LABEL: d_to_h:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    fcvtxn v2.2s, v2.2d
+; CHECK-CVT-NEXT:    fcvtxn v0.2s, v0.2d
+; CHECK-CVT-NEXT:    fcvtxn2 v2.4s, v3.2d
+; CHECK-CVT-NEXT:    fcvtxn2 v0.4s, v1.2d
+; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    movi v3.4s, #127, msl #8
+; CHECK-CVT-NEXT:    ushr v4.4s, v2.4s, #16
+; CHECK-CVT-NEXT:    ushr v5.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    add v6.4s, v2.4s, v3.4s
+; CHECK-CVT-NEXT:    add v3.4s, v0.4s, v3.4s
+; CHECK-CVT-NEXT:    and v4.16b, v4.16b, v1.16b
+; CHECK-CVT-NEXT:    and v1.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT:    fcmeq v5.4s, v2.4s, v2.4s
+; CHECK-CVT-NEXT:    orr v2.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    add v4.4s, v4.4s, v6.4s
+; CHECK-CVT-NEXT:    fcmeq v6.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT:    add v1.4s, v1.4s, v3.4s
+; CHECK-CVT-NEXT:    orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    bit v2.16b, v4.16b, v5.16b
+; CHECK-CVT-NEXT:    bit v0.16b, v1.16b, v6.16b
+; CHECK-CVT-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: d_to_h:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    fcvtxn v2.2s, v2.2d
+; CHECK-BF16-NEXT:    fcvtxn v0.2s, v0.2d
+; CHECK-BF16-NEXT:    fcvtxn2 v2.4s, v3.2d
+; CHECK-BF16-NEXT:    fcvtxn2 v0.4s, v1.2d
+; CHECK-BF16-NEXT:    bfcvtn v1.4h, v2.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT:    ret
+  %trunc = fptrunc <8 x double> %a to <8 x bfloat> ; doubles are first narrowed to f32 with fcvtxn, then to bf16
+  ret <8 x bfloat> %trunc
+}
+
+define <8 x float> @h_to_s(<8 x bfloat> %a) {
+; CHECK-LABEL: h_to_s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    ret
+  %ext = fpext <8 x bfloat> %a to <8 x float> ; each bf16 lane is shifted left by 16 (shll) to form the f32 lane
+  ret <8 x float> %ext
+}
+
+define <8 x double> @h_to_d(<8 x bfloat> %a) {
+; CHECK-LABEL: h_to_d:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    shll v2.4s, v0.4h, #16
+; CHECK-NEXT:    fcvtl v0.2d, v2.2s
+; CHECK-NEXT:    shll v4.4s, v1.4h, #16
+; CHECK-NEXT:    fcvtl2 v1.2d, v2.4s
+; CHECK-NEXT:    fcvtl2 v3.2d, v4.4s
+; CHECK-NEXT:    fcvtl v2.2d, v4.2s
+; CHECK-NEXT:    ret
+  %ext = fpext <8 x bfloat> %a to <8 x double> ; bf16 lanes are widened to f32 with shll, then to f64 with fcvtl/fcvtl2
+  ret <8 x double> %ext
+}
+
+
+define <8 x bfloat> @bitcast_i_to_h(float, <8 x i16> %a) {
+; CHECK-LABEL: bitcast_i_to_h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %cast = bitcast <8 x i16> %a to <8 x bfloat> ; the leading unnamed float argument puts %a in v1, so the cast shows up as one register move
+  ret <8 x bfloat> %cast
+}
+
+define <8 x i16> @bitcast_h_to_i(float, <8 x bfloat> %a) {
+; CHECK-LABEL: bitcast_h_to_i:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %cast = bitcast <8 x bfloat> %a to <8 x i16> ; the leading unnamed float argument puts %a in v1, so the cast shows up as one register move
+  ret <8 x i16> %cast
+}
+
+define <4 x bfloat> @sitofp_v4i8(<4 x i8> %a) #0 {
+; CHECK-CVT-LABEL: sitofp_v4i8:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    shl v0.4h, v0.4h, #8
+; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    sshr v0.4h, v0.4h, #8
+; CHECK-CVT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-CVT-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT:    ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT:    movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT:    addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: sitofp_v4i8:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    shl v0.4h, v0.4h, #8
+; CHECK-BF16-NEXT:    sshr v0.4h, v0.4h, #8
+; CHECK-BF16-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT:    ret
+  %cvt = sitofp <4 x i8> %a to <4 x bfloat> ; the shl/sshr pair sign-extends the low byte of each 16-bit lane
+  ret <4 x bfloat> %cvt
+}
+
+define <8 x bfloat> @sitofp_v8i8(<8 x i8> %a) #0 {
+; CHECK-CVT-LABEL: sitofp_v8i8:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    movi v4.4s, #127, msl #8
+; CHECK-CVT-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-CVT-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-CVT-NEXT:    scvtf v2.4s, v2.4s
+; CHECK-CVT-NEXT:    scvtf v3.4s, v0.4s
+; CHECK-CVT-NEXT:    ushr v0.4s, v2.4s, #16
+; CHECK-CVT-NEXT:    ushr v5.4s, v3.4s, #16
+; CHECK-CVT-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-CVT-NEXT:    and v1.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT:    add v0.4s, v0.4s, v4.4s
+; CHECK-CVT-NEXT:    add v1.4s, v1.4s, v4.4s
+; CHECK-CVT-NEXT:    addhn v0.4h, v2.4s, v0.4s
+; CHECK-CVT-NEXT:    addhn2 v0.8h, v3.4s, v1.4s
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: sitofp_v8i8:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-BF16-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-BF16-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT:    scvtf v1.4s, v1.4s
+; CHECK-BF16-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT:    bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT:    ret
+  %cvt = sitofp <8 x i8> %a to <8 x bfloat> ; bytes are sign-extended in two steps to 32-bit lanes and converted per 4-lane half
+  ret <8 x bfloat> %cvt
+}
+
+define <16 x bfloat> @sitofp_v16i8(<16 x i8> %a) #0 {
+; CHECK-CVT-LABEL: sitofp_v16i8:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    sshll2 v2.8h, v0.16b, #0
+; CHECK-CVT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    movi v7.4s, #127, msl #8
+; CHECK-CVT-NEXT:    sshll v3.4s, v2.4h, #0
+; CHECK-CVT-NEXT:    sshll v4.4s, v0.4h, #0
+; CHECK-CVT-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-CVT-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-CVT-NEXT:    scvtf v3.4s, v3.4s
+; CHECK-CVT-NEXT:    scvtf v4.4s, v4.4s
+; CHECK-CVT-NEXT:    scvtf v2.4s, v2.4s
+; CHECK-CVT-NEXT:    scvtf v6.4s, v0.4s
+; CHECK-CVT-NEXT:    ushr v5.4s, v3.4s, #16
+; CHECK-CVT-NEXT:    ushr v0.4s, v4.4s, #16
+; CHECK-CVT-NEXT:    ushr v16.4s, v2.4s, #16
+; CHECK-CVT-NEXT:    ushr v17.4s, v6.4s, #16
+; CHECK-CVT-NEXT:    and v5.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-CVT-NEXT:    and v16.16b, v16.16b, v1.16b
+; CHECK-CVT-NEXT:    and v17.16b, v17.16b, v1.16b
+; CHECK-CVT-NEXT:    add v5.4s, v5.4s, v7.4s
+; CHECK-CVT-NEXT:    add v0.4s, v0.4s, v7.4s
+; CHECK-CVT-NEXT:    addhn v1.4h, v3.4s, v5.4s
+; CHECK-CVT-NEXT:    addhn v0.4h, v4.4s, v0.4s
+; CHECK-CVT-NEXT:    add v3.4s, v16.4s, v7.4s
+; CHECK-CVT-NEXT:    add v4.4s, v17.4s, v7.4s
+; CHECK-CVT-NEXT:    addhn2 v1.8h, v2.4s, v3.4s
+; CHECK-CVT-NEXT:    addhn2 v0.8h, v6.4s, v4.4s
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: sitofp_v16i8:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    sshll2 v1.8h, v0.16b, #0
+; CHECK-BF16-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-BF16-NEXT:    sshll2 v2.4s, v1.8h, #0
+; CHECK-BF16-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-BF16-NEXT:    sshll2 v3.4s, v0.8h, #0
+; CHECK-BF16-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT:    scvtf v2.4s, v2.4s
+; CHECK-BF16-NEXT:    scvtf v1.4s, v1.4s
+; CHECK-BF16-NEXT:    scvtf v3.4s, v3.4s
+; CHECK-BF16-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT:    bfcvtn v2.4h, v2.4s
+; CHECK-BF16-NEXT:    bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT:    bfcvtn v3.4h, v3.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    mov v1.d[1], v2.d[0]
+; CHECK-BF16-NEXT:    mov v0.d[1], v3.d[0]
+; CHECK-BF16-NEXT:    ret
+  %cvt = sitofp <16 x i8> %a to <16 x bfloat> ; the 16-lane result is returned in two registers (v0 and v1)
+  ret <16 x bfloat> %cvt
+}
+
+define <8 x bfloat> @sitofp_i16(<8 x i16> %a) #0 {
+; CHECK-CVT-LABEL: sitofp_i16:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-CVT-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    movi v4.4s, #127, msl #8
+; CHECK-CVT-NEXT:    scvtf v2.4s, v2.4s
+; CHECK-CVT-NEXT:    scvtf v3.4s, v0.4s
+; CHECK-CVT-NEXT:    ushr v0.4s, v2.4s, #16
+; CHECK-CVT-NEXT:    ushr v5.4s, v3.4s, #16
+; CHECK-CVT-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-CVT-NEXT:    and v1.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT:    add v0.4s, v0.4s, v4.4s
+; CHECK-CVT-NEXT:    add v1.4s, v1.4s, v4.4s
+; CHECK-CVT-NEXT:    addhn v0.4h, v2.4s, v0.4s
+; CHECK-CVT-NEXT:    addhn2 v0.8h, v3.4s, v1.4s
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: sitofp_i16:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-BF16-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT:    scvtf v1.4s, v1.4s
+; CHECK-BF16-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT:    bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT:    ret
+  %cvt = sitofp <8 x i16> %a to <8 x bfloat> ; lanes are sign-extended to 32 bits and converted per 4-lane half
+  ret <8 x bfloat> %cvt
+}
+
+define <8 x bfloat> @sitofp_i32(<8 x i32> %a) #0 {
+; CHECK-CVT-LABEL: sitofp_i32:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT:    movi v2.4s, #1
+; CHECK-CVT-NEXT:    scvtf v1.4s, v1.4s
+; CHECK-CVT-NEXT:    movi v5.4s, #127, msl #8
+; CHECK-CVT-NEXT:    ushr v3.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    ushr v4.4s, v1.4s, #16
+; CHECK-CVT-NEXT:    and v3.16b, v3.16b, v2.16b
+; CHECK-CVT-NEXT:    and v2.16b, v4.16b, v2.16b
+; CHECK-CVT-NEXT:    add v0.4s, v3.4s, v0.4s
+; CHECK-CVT-NEXT:    add v1.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT:    addhn v0.4h, v0.4s, v5.4s
+; CHECK-CVT-NEXT:    addhn2 v0.8h, v1.4s, v5.4s
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: sitofp_i32:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    scvtf v1.4s, v1.4s
+; CHECK-BF16-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT:    bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT:    ret
+  %cvt = sitofp <8 x i32> %a to <8 x bfloat> ; both input registers are converted to f32 in place before narrowing
+  ret <8 x bfloat> %cvt
+}
+
+
+define <8 x bfloat> @sitofp_i64(<8 x i64> %a) #0 {
+; CHECK-CVT-LABEL: sitofp_i64:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    scvtf v2.2d, v2.2d
+; CHECK-CVT-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-CVT-NEXT:    scvtf v3.2d, v3.2d
+; CHECK-CVT-NEXT:    scvtf v1.2d, v1.2d
+; CHECK-CVT-NEXT:    fcvtn v2.2s, v2.2d
+; CHECK-CVT-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-CVT-NEXT:    fcvtn2 v2.4s, v3.2d
+; CHECK-CVT-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    movi v3.4s, #127, msl #8
+; CHECK-CVT-NEXT:    ushr v4.4s, v2.4s, #16
+; CHECK-CVT-NEXT:    ushr v5.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    add v6.4s, v2.4s, v3.4s
+; CHECK-CVT-NEXT:    add v3.4s, v0.4s, v3.4s
+; CHECK-CVT-NEXT:    and v4.16b, v4.16b, v1.16b
+; CHECK-CVT-NEXT:    and v1.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT:    fcmeq v5.4s, v2.4s, v2.4s
+; CHECK-CVT-NEXT:    orr v2.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    add v4.4s, v4.4s, v6.4s
+; CHECK-CVT-NEXT:    fcmeq v6.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT:    add v1.4s, v1.4s, v3.4s
+; CHECK-CVT-NEXT:    orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    bit v2.16b, v4.16b, v5.16b
+; CHECK-CVT-NEXT:    bit v0.16b, v1.16b, v6.16b
+; CHECK-CVT-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: sitofp_i64:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    scvtf v2.2d, v2.2d
+; CHECK-BF16-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-BF16-NEXT:    scvtf v3.2d, v3.2d
+; CHECK-BF16-NEXT:    scvtf v1.2d, v1.2d
+; CHECK-BF16-NEXT:    fcvtn v2.2s, v2.2d
+; CHECK-BF16-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-BF16-NEXT:    fcvtn2 v2.4s, v3.2d
+; CHECK-BF16-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-BF16-NEXT:    bfcvtn v1.4h, v2.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT:    ret
+  %cvt = sitofp <8 x i64> %a to <8 x bfloat> ; signed i64 lanes pass through f64 and f32 before narrowing to bf16
+  ret <8 x bfloat> %cvt
+}
+
+define <4 x bfloat> @uitofp_v4i8(<4 x i8> %a) #0 {
+; CHECK-CVT-LABEL: uitofp_v4i8:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-CVT-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT:    ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT:    movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT:    addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: uitofp_v4i8:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-BF16-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT:    ret
+  %cvt = uitofp <4 x i8> %a to <4 x bfloat> ; the bic masks off the high byte of each 16-bit lane before widening
+  ret <4 x bfloat> %cvt
+}
+
+define <8 x bfloat> @uitofp_v8i8(<8 x i8> %a) #0 {
+; CHECK-CVT-LABEL: uitofp_v8i8:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    movi v4.4s, #127, msl #8
+; CHECK-CVT-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-CVT-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-CVT-NEXT:    ucvtf v2.4s, v2.4s
+; CHECK-CVT-NEXT:    ucvtf v3.4s, v0.4s
+; CHECK-CVT-NEXT:    ushr v0.4s, v2.4s, #16
+; CHECK-CVT-NEXT:    ushr v5.4s, v3.4s, #16
+; CHECK-CVT-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-CVT-NEXT:    and v1.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT:    add v0.4s, v0.4s, v4.4s
+; CHECK-CVT-NEXT:    add v1.4s, v1.4s, v4.4s
+; CHECK-CVT-NEXT:    addhn v0.4h, v2.4s, v0.4s
+; CHECK-CVT-NEXT:    addhn2 v0.8h, v3.4s, v1.4s
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: uitofp_v8i8:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-BF16-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-BF16-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT:    ucvtf v1.4s, v1.4s
+; CHECK-BF16-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT:    bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT:    ret
+  %1 = uitofp <8 x i8> %a to <8 x bfloat>
+  ret <8 x bfloat> %1
+}
+
+define <16 x bfloat> @uitofp_v16i8(<16 x i8> %a) #0 {
+; CHECK-CVT-LABEL: uitofp_v16i8:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    ushll2 v2.8h, v0.16b, #0
+; CHECK-CVT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    movi v7.4s, #127, msl #8
+; CHECK-CVT-NEXT:    ushll v3.4s, v2.4h, #0
+; CHECK-CVT-NEXT:    ushll v4.4s, v0.4h, #0
+; CHECK-CVT-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-CVT-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-CVT-NEXT:    ucvtf v3.4s, v3.4s
+; CHECK-CVT-NEXT:    ucvtf v4.4s, v4.4s
+; CHECK-CVT-NEXT:    ucvtf v2.4s, v2.4s
+; CHECK-CVT-NEXT:    ucvtf v6.4s, v0.4s
+; CHECK-CVT-NEXT:    ushr v5.4s, v3.4s, #16
+; CHECK-CVT-NEXT:    ushr v0.4s, v4.4s, #16
+; CHECK-CVT-NEXT:    ushr v16.4s, v2.4s, #16
+; CHECK-CVT-NEXT:    ushr v17.4s, v6.4s, #16
+; CHECK-CVT-NEXT:    and v5.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-CVT-NEXT:    and v16.16b, v16.16b, v1.16b
+; CHECK-CVT-NEXT:    and v17.16b, v17.16b, v1.16b
+; CHECK-CVT-NEXT:    add v5.4s, v5.4s, v7.4s
+; CHECK-CVT-NEXT:    add v0.4s, v0.4s, v7.4s
+; CHECK-CVT-NEXT:    addhn v1.4h, v3.4s, v5.4s
+; CHECK-CVT-NEXT:    addhn v0.4h, v4.4s, v0.4s
+; CHECK-CVT-NEXT:    add v3.4s, v16.4s, v7.4s
+; CHECK-CVT-NEXT:    add v4.4s, v17.4s, v7.4s
+; CHECK-CVT-NEXT:    addhn2 v1.8h, v2.4s, v3.4s
+; CHECK-CVT-NEXT:    addhn2 v0.8h, v6.4s, v4.4s
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: uitofp_v16i8:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    ushll2 v1.8h, v0.16b, #0
+; CHECK-BF16-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-BF16-NEXT:    ushll2 v2.4s, v1.8h, #0
+; CHECK-BF16-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-BF16-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-BF16-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT:    ucvtf v2.4s, v2.4s
+; CHECK-BF16-NEXT:    ucvtf v1.4s, v1.4s
+; CHECK-BF16-NEXT:    ucvtf v3.4s, v3.4s
+; CHECK-BF16-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT:    bfcvtn v2.4h, v2.4s
+; CHECK-BF16-NEXT:    bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT:    bfcvtn v3.4h, v3.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    mov v1.d[1], v2.d[0]
+; CHECK-BF16-NEXT:    mov v0.d[1], v3.d[0]
+; CHECK-BF16-NEXT:    ret
+  %1 = uitofp <16 x i8> %a to <16 x bfloat>
+  ret <16 x bfloat> %1
+}
+
+
+define <8 x bfloat> @uitofp_i16(<8 x i16> %a) #0 {
+; CHECK-CVT-LABEL: uitofp_i16:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-CVT-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    movi v4.4s, #127, msl #8
+; CHECK-CVT-NEXT:    ucvtf v2.4s, v2.4s
+; CHECK-CVT-NEXT:    ucvtf v3.4s, v0.4s
+; CHECK-CVT-NEXT:    ushr v0.4s, v2.4s, #16
+; CHECK-CVT-NEXT:    ushr v5.4s, v3.4s, #16
+; CHECK-CVT-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-CVT-NEXT:    and v1.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT:    add v0.4s, v0.4s, v4.4s
+; CHECK-CVT-NEXT:    add v1.4s, v1.4s, v4.4s
+; CHECK-CVT-NEXT:    addhn v0.4h, v2.4s, v0.4s
+; CHECK-CVT-NEXT:    addhn2 v0.8h, v3.4s, v1.4s
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: uitofp_i16:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-BF16-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT:    ucvtf v1.4s, v1.4s
+; CHECK-BF16-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT:    bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT:    ret
+  %1 = uitofp <8 x i16> %a to <8 x bfloat>
+  ret <8 x bfloat> %1
+}
+
+
+define <8 x bfloat> @uitofp_i32(<8 x i32> %a) #0 {
+; CHECK-CVT-LABEL: uitofp_i32:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT:    movi v2.4s, #1
+; CHECK-CVT-NEXT:    ucvtf v1.4s, v1.4s
+; CHECK-CVT-NEXT:    movi v5.4s, #127, msl #8
+; CHECK-CVT-NEXT:    ushr v3.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    ushr v4.4s, v1.4s, #16
+; CHECK-CVT-NEXT:    and v3.16b, v3.16b, v2.16b
+; CHECK-CVT-NEXT:    and v2.16b, v4.16b, v2.16b
+; CHECK-CVT-NEXT:    add v0.4s, v3.4s, v0.4s
+; CHECK-CVT-NEXT:    add v1.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT:    addhn v0.4h, v0.4s, v5.4s
+; CHECK-CVT-NEXT:    addhn2 v0.8h, v1.4s, v5.4s
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: uitofp_i32:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    ucvtf v1.4s, v1.4s
+; CHECK-BF16-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT:    bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT:    ret
+  %1 = uitofp <8 x i32> %a to <8 x bfloat>
+  ret <8 x bfloat> %1
+}
+
+
+define <8 x bfloat> @uitofp_i64(<8 x i64> %a) #0 {
+; CHECK-CVT-LABEL: uitofp_i64:
+; CHECK-CVT:       // %bb.0:
+; CHECK-CVT-NEXT:    ucvtf v2.2d, v2.2d
+; CHECK-CVT-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-CVT-NEXT:    ucvtf v3.2d, v3.2d
+; CHECK-CVT-NEXT:    ucvtf v1.2d, v1.2d
+; CHECK-CVT-NEXT:    fcvtn v2.2s, v2.2d
+; CHECK-CVT-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-CVT-NEXT:    fcvtn2 v2.4s, v3.2d
+; CHECK-CVT-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    movi v3.4s, #127, msl #8
+; CHECK-CVT-NEXT:    ushr v4.4s, v2.4s, #16
+; CHECK-CVT-NEXT:    ushr v5.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    add v6.4s, v2.4s, v3.4s
+; CHECK-CVT-NEXT:    add v3.4s, v0.4s, v3.4s
+; CHECK-CVT-NEXT:    and v4.16b, v4.16b, v1.16b
+; CHECK-CVT-NEXT:    and v1.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT:    fcmeq v5.4s, v2.4s, v2.4s
+; CHECK-CVT-NEXT:    orr v2.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    add v4.4s, v4.4s, v6.4s
+; CHECK-CVT-NEXT:    fcmeq v6.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT:    add v1.4s, v1.4s, v3.4s
+; CHECK-CVT-NEXT:    orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    bit v2.16b, v4.16b, v5.16b
+; CHECK-CVT-NEXT:    bit v0.16b, v1.16b, v6.16b
+; CHECK-CVT-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT:    ret
+;
+; CHECK-BF16-LABEL: uitofp_i64:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    ucvtf v2.2d, v2.2d
+; CHECK-BF16-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-BF16-NEXT:    ucvtf v3.2d, v3.2d
+; CHECK-BF16-NEXT:    ucvtf v1.2d, v1.2d
+; CHECK-BF16-NEXT:    fcvtn v2.2s, v2.2d
+; CHECK-BF16-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-BF16-NEXT:    fcvtn2 v2.4s, v3.2d
+; CHECK-BF16-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-BF16-NEXT:    bfcvtn v1.4h, v2.4s
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT:    ret
+  %1 = uitofp <8 x i64> %a to <8 x bfloat>
+  ret <8 x bfloat> %1
+}
+
+define void @test_insert_at_zero(bfloat %a, ptr %b) #0 {
+; CHECK-LABEL: test_insert_at_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $q0
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
+  %1 = insertelement <8 x bfloat> undef, bfloat %a, i64 0
+  store <8 x bfloat> %1, ptr %b, align 4
+  ret void
+}
+
+define <8 x i8> @fptosi_i8(<8 x bfloat> %a) #0 {
+; CHECK-LABEL: fptosi_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    fcvtzs v1.4s, v1.4s
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    ret
+  %1 = fptosi<8 x bfloat> %a to <8 x i8>
+  ret <8 x i8> %1
+}
+
+define <8 x i16> @fptosi_i16(<8 x bfloat> %a) #0 {
+; CHECK-LABEL: fptosi_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    fcvtzs v1.4s, v1.4s
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
+  %1 = fptosi<8 x bfloat> %a to <8 x i16>
+  ret <8 x i16> %1
+}
+
+define <8 x i8> @fptoui_i8(<8 x bfloat> %a) #0 {
+; CHECK-LABEL: fptoui_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    fcvtzu v1.4s, v1.4s
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    ret
+  %1 = fptoui<8 x bfloat> %a to <8 x i8>
+  ret <8 x i8> %1
+}
+
+define <8 x i16> @fptoui_i16(<8 x bfloat> %a) #0 {
+; CHECK-LABEL: fptoui_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    fcvtzu v1.4s, v1.4s
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
+  %1 = fptoui<8 x bfloat> %a to <8 x i16>
+  ret <8 x i16> %1
+}
+
+define <8 x i1> @test_fcmp_une(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_une:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov h2, v1.h[1]
+; CHECK-NEXT:    mov h3, v0.h[1]
+; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    mov h2, v1.h[2]
+; CHECK-NEXT:    mov h3, v0.h[2]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    lsl w9, w10, #16
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    mov h3, v1.h[4]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fcmp s5, s4
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    mov h4, v1.h[3]
+; CHECK-NEXT:    lsl w10, w10, #16
+; CHECK-NEXT:    fmov s6, w8
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    csetm w9, ne
+; CHECK-NEXT:    fmov s16, w10
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    mov h5, v0.h[3]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    mov h6, v0.h[4]
+; CHECK-NEXT:    mov h4, v1.h[5]
+; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov h5, v0.h[5]
+; CHECK-NEXT:    fcmp s16, s7
+; CHECK-NEXT:    mov v2.h[1], w9
+; CHECK-NEXT:    lsl w9, w10, #16
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    fmov s3, w9
+; CHECK-NEXT:    fmov w9, s6
+; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    mov v2.h[2], w8
+; CHECK-NEXT:    lsl w8, w10, #16
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fcmp s7, s3
+; CHECK-NEXT:    mov h3, v1.h[6]
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    mov h1, v1.h[7]
+; CHECK-NEXT:    fmov s6, w9
+; CHECK-NEXT:    fmov w9, s5
+; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    mov v2.h[3], w8
+; CHECK-NEXT:    lsl w8, w10, #16
+; CHECK-NEXT:    fcmp s6, s4
+; CHECK-NEXT:    mov h4, v0.h[6]
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s5, w8
+; CHECK-NEXT:    mov h0, v0.h[7]
+; CHECK-NEXT:    fmov s6, w9
+; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    mov v2.h[4], w8
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    csetm w10, ne
+; CHECK-NEXT:    fmov s3, w8
+; CHECK-NEXT:    fmov s4, w9
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    mov v2.h[5], w10
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    mov v2.h[6], w8
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    mov v2.h[7], w8
+; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    ret
+  %1 = fcmp une <8 x bfloat> %a, %b
+  ret <8 x i1> %1
+}
+
+define <8 x i1> @test_fcmp_ueq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ueq:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov h2, v1.h[1]
+; CHECK-NEXT:    mov h3, v0.h[1]
+; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    fmov w11, s0
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    mov h2, v1.h[2]
+; CHECK-NEXT:    mov h3, v0.h[2]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    lsl w8, w10, #16
+; CHECK-NEXT:    lsl w9, w11, #16
+; CHECK-NEXT:    fmov s6, w8
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    fcmp s5, s4
+; CHECK-NEXT:    fmov s7, w9
+; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    mov h4, v1.h[3]
+; CHECK-NEXT:    mov h5, v0.h[3]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    csetm w10, eq
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    csinv w10, w10, wzr, vc
+; CHECK-NEXT:    fcmp s7, s6
+; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    fmov s3, w9
+; CHECK-NEXT:    fmov w11, s4
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov h4, v0.h[4]
+; CHECK-NEXT:    mov h7, v1.h[5]
+; CHECK-NEXT:    csetm w9, eq
+; CHECK-NEXT:    csinv w9, w9, wzr, vc
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    mov h3, v1.h[4]
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    lsl w11, w11, #16
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov s5, w11
+; CHECK-NEXT:    fmov s6, w8
+; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    mov v2.h[1], w10
+; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    csinv w8, w8, wzr, vc
+; CHECK-NEXT:    mov h3, v1.h[6]
+; CHECK-NEXT:    mov h1, v1.h[7]
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    mov h5, v0.h[5]
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    lsl w10, w10, #16
+; CHECK-NEXT:    mov v2.h[2], w8
+; CHECK-NEXT:    fmov s4, w9
+; CHECK-NEXT:    fmov s6, w10
+; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    fmov w9, s7
+; CHECK-NEXT:    fmov w10, s5
+; CHECK-NEXT:    csinv w8, w8, wzr, vc
+; CHECK-NEXT:    fcmp s6, s4
+; CHECK-NEXT:    mov h4, v0.h[6]
+; CHECK-NEXT:    mov v2.h[3], w8
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    lsl w10, w10, #16
+; CHECK-NEXT:    mov h0, v0.h[7]
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    fmov s6, w10
+; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    csinv w8, w8, wzr, vc
+; CHECK-NEXT:    mov v2.h[4], w8
+; CHECK-NEXT:    lsl w8, w9, #16
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    lsl w9, w10, #16
+; CHECK-NEXT:    fmov s3, w8
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov s4, w9
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    csetm w10, eq
+; CHECK-NEXT:    csinv w10, w10, wzr, vc
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    mov v2.h[5], w10
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    csinv w8, w8, wzr, vc
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    mov v2.h[6], w8
+; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    csinv w8, w8, wzr, vc
+; CHECK-NEXT:    mov v2.h[7], w8
+; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    ret
+  %1 = fcmp ueq <8 x bfloat> %a, %b
+  ret <8 x i1> %1
+}
+
+define <8 x i1> @test_fcmp_ugt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ugt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov h2, v1.h[1]
+; CHECK-NEXT:    mov h3, v0.h[1]
+; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    mov h2, v1.h[2]
+; CHECK-NEXT:    mov h3, v0.h[2]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    lsl w9, w10, #16
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    mov h3, v1.h[4]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fcmp s5, s4
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    mov h4, v1.h[3]
+; CHECK-NEXT:    lsl w10, w10, #16
+; CHECK-NEXT:    fmov s6, w8
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    csetm w9, hi
+; CHECK-NEXT:    fmov s16, w10
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    mov h5, v0.h[3]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    mov h6, v0.h[4]
+; CHECK-NEXT:    mov h4, v1.h[5]
+; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    csetm w8, hi
+; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov h5, v0.h[5]
+; CHECK-NEXT:    fcmp s16, s7
+; CHECK-NEXT:    mov v2.h[1], w9
+; CHECK-NEXT:    lsl w9, w10, #16
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    fmov s3, w9
+; CHECK-NEXT:    fmov w9, s6
+; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    csetm w8, hi
+; CHECK-NEXT:    mov v2.h[2], w8
+; CHECK-NEXT:    lsl w8, w10, #16
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fcmp s7, s3
+; CHECK-NEXT:    mov h3, v1.h[6]
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    mov h1, v1.h[7]
+; CHECK-NEXT:    fmov s6, w9
+; CHECK-NEXT:    fmov w9, s5
+; CHECK-NEXT:    csetm w8, hi
+; CHECK-NEXT:    mov v2.h[3], w8
+; CHECK-NEXT:    lsl w8, w10, #16
+; CHECK-NEXT:    fcmp s6, s4
+; CHECK-NEXT:    mov h4, v0.h[6]
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s5, w8
+; CHECK-NEXT:    mov h0, v0.h[7]
+; CHECK-NEXT:    fmov s6, w9
+; CHECK-NEXT:    csetm w8, hi
+; CHECK-NEXT:    mov v2.h[4], w8
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    csetm w10, hi
+; CHECK-NEXT:    fmov s3, w8
+; CHECK-NEXT:    fmov s4, w9
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    mov v2.h[5], w10
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    csetm w8, hi
+; CHECK-NEXT:    mov v2.h[6], w8
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    csetm w8, hi
+; CHECK-NEXT:    mov v2.h[7], w8
+; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    ret
+  %1 = fcmp ugt <8 x bfloat> %a, %b
+  ret <8 x i1> %1
+}
+
+define <8 x i1> @test_fcmp_uge(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_uge:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov h2, v1.h[1]
+; CHECK-NEXT:    mov h3, v0.h[1]
+; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    mov h2, v1.h[2]
+; CHECK-NEXT:    mov h3, v0.h[2]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    lsl w9, w10, #16
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    mov h3, v1.h[4]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fcmp s5, s4
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    mov h4, v1.h[3]
+; CHECK-NEXT:    lsl w10, w10, #16
+; CHECK-NEXT:    fmov s6, w8
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    csetm w9, pl
+; CHECK-NEXT:    fmov s16, w10
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    mov h5, v0.h[3]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    mov h6, v0.h[4]
+; CHECK-NEXT:    mov h4, v1.h[5]
+; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    csetm w8, pl
+; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov h5, v0.h[5]
+; CHECK-NEXT:    fcmp s16, s7
+; CHECK-NEXT:    mov v2.h[1], w9
+; CHECK-NEXT:    lsl w9, w10, #16
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    fmov s3, w9
+; CHECK-NEXT:    fmov w9, s6
+; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    csetm w8, pl
+; CHECK-NEXT:    mov v2.h[2], w8
+; CHECK-NEXT:    lsl w8, w10, #16
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fcmp s7, s3
+; CHECK-NEXT:    mov h3, v1.h[6]
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    mov h1, v1.h[7]
+; CHECK-NEXT:    fmov s6, w9
+; CHECK-NEXT:    fmov w9, s5
+; CHECK-NEXT:    csetm w8, pl
+; CHECK-NEXT:    mov v2.h[3], w8
+; CHECK-NEXT:    lsl w8, w10, #16
+; CHECK-NEXT:    fcmp s6, s4
+; CHECK-NEXT:    mov h4, v0.h[6]
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s5, w8
+; CHECK-NEXT:    mov h0, v0.h[7]
+; CHECK-NEXT:    fmov s6, w9
+; CHECK-NEXT:    csetm w8, pl
+; CHECK-NEXT:    mov v2.h[4], w8
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    csetm w10, pl
+; CHECK-NEXT:    fmov s3, w8
+; CHECK-NEXT:    fmov s4, w9
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    mov v2.h[5], w10
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    csetm w8, pl
+; CHECK-NEXT:    mov v2.h[6], w8
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    csetm w8, pl
+; CHECK-NEXT:    mov v2.h[7], w8
+; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    ret
+  %1 = fcmp uge <8 x bfloat> %a, %b
+  ret <8 x i1> %1
+}
+
+define <8 x i1> @test_fcmp_ult(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ult:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov h2, v1.h[1]
+; CHECK-NEXT:    mov h3, v0.h[1]
+; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    mov h2, v1.h[2]
+; CHECK-NEXT:    mov h3, v0.h[2]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    lsl w9, w10, #16
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    mov h3, v1.h[4]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fcmp s5, s4
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    mov h4, v1.h[3]
+; CHECK-NEXT:    lsl w10, w10, #16
+; CHECK-NEXT:    fmov s6, w8
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    csetm w9, lt
+; CHECK-NEXT:    fmov s16, w10
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    mov h5, v0.h[3]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    mov h6, v0.h[4]
+; CHECK-NEXT:    mov h4, v1.h[5]
+; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    csetm w8, lt
+; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov h5, v0.h[5]
+; CHECK-NEXT:    fcmp s16, s7
+; CHECK-NEXT:    mov v2.h[1], w9
+; CHECK-NEXT:    lsl w9, w10, #16
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    fmov s3, w9
+; CHECK-NEXT:    fmov w9, s6
+; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    csetm w8, lt
+; CHECK-NEXT:    mov v2.h[2], w8
+; CHECK-NEXT:    lsl w8, w10, #16
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fcmp s7, s3
+; CHECK-NEXT:    mov h3, v1.h[6]
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    mov h1, v1.h[7]
+; CHECK-NEXT:    fmov s6, w9
+; CHECK-NEXT:    fmov w9, s5
+; CHECK-NEXT:    csetm w8, lt
+; CHECK-NEXT:    mov v2.h[3], w8
+; CHECK-NEXT:    lsl w8, w10, #16
+; CHECK-NEXT:    fcmp s6, s4
+; CHECK-NEXT:    mov h4, v0.h[6]
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s5, w8
+; CHECK-NEXT:    mov h0, v0.h[7]
+; CHECK-NEXT:    fmov s6, w9
+; CHECK-NEXT:    csetm w8, lt
+; CHECK-NEXT:    mov v2.h[4], w8
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    csetm w10, lt
+; CHECK-NEXT:    fmov s3, w8
+; CHECK-NEXT:    fmov s4, w9
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    mov v2.h[5], w10
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    csetm w8, lt
+; CHECK-NEXT:    mov v2.h[6], w8
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    csetm w8, lt
+; CHECK-NEXT:    mov v2.h[7], w8
+; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    ret
+  %1 = fcmp ult <8 x bfloat> %a, %b
+  ret <8 x i1> %1
+}
+
+define <8 x i1> @test_fcmp_ule(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ule:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov h2, v1.h[1]
+; CHECK-NEXT:    mov h3, v0.h[1]
+; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    mov h2, v1.h[2]
+; CHECK-NEXT:    mov h3, v0.h[2]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    lsl w9, w10, #16
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    mov h3, v1.h[4]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fcmp s5, s4
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    mov h4, v1.h[3]
+; CHECK-NEXT:    lsl w10, w10, #16
+; CHECK-NEXT:    fmov s6, w8
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    csetm w9, le
+; CHECK-NEXT:    fmov s16, w10
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    mov h5, v0.h[3]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    mov h6, v0.h[4]
+; CHECK-NEXT:    mov h4, v1.h[5]
+; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    csetm w8, le
+; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov h5, v0.h[5]
+; CHECK-NEXT:    fcmp s16, s7
+; CHECK-NEXT:    mov v2.h[1], w9
+; CHECK-NEXT:    lsl w9, w10, #16
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    fmov s3, w9
+; CHECK-NEXT:    fmov w9, s6
+; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    csetm w8, le
+; CHECK-NEXT:    mov v2.h[2], w8
+; CHECK-NEXT:    lsl w8, w10, #16
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fcmp s7, s3
+; CHECK-NEXT:    mov h3, v1.h[6]
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    mov h1, v1.h[7]
+; CHECK-NEXT:    fmov s6, w9
+; CHECK-NEXT:    fmov w9, s5
+; CHECK-NEXT:    csetm w8, le
+; CHECK-NEXT:    mov v2.h[3], w8
+; CHECK-NEXT:    lsl w8, w10, #16
+; CHECK-NEXT:    fcmp s6, s4
+; CHECK-NEXT:    mov h4, v0.h[6]
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s5, w8
+; CHECK-NEXT:    mov h0, v0.h[7]
+; CHECK-NEXT:    fmov s6, w9
+; CHECK-NEXT:    csetm w8, le
+; CHECK-NEXT:    mov v2.h[4], w8
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    csetm w10, le
+; CHECK-NEXT:    fmov s3, w8
+; CHECK-NEXT:    fmov s4, w9
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    mov v2.h[5], w10
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    csetm w8, le
+; CHECK-NEXT:    mov v2.h[6], w8
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    csetm w8, le
+; CHECK-NEXT:    mov v2.h[7], w8
+; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    ret
+  %1 = fcmp ule <8 x bfloat> %a, %b
+  ret <8 x i1> %1
+}
+
+define <8 x i1> @test_fcmp_uno(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_uno:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov h2, v1.h[1]
+; CHECK-NEXT:    mov h3, v0.h[1]
+; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    mov h2, v1.h[2]
+; CHECK-NEXT:    mov h3, v0.h[2]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    lsl w9, w10, #16
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    mov h3, v1.h[4]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fcmp s5, s4
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    mov h4, v1.h[3]
+; CHECK-NEXT:    lsl w10, w10, #16
+; CHECK-NEXT:    fmov s6, w8
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    csetm w9, vs
+; CHECK-NEXT:    fmov s16, w10
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    mov h5, v0.h[3]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    mov h6, v0.h[4]
+; CHECK-NEXT:    mov h4, v1.h[5]
+; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    csetm w8, vs
+; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov h5, v0.h[5]
+; CHECK-NEXT:    fcmp s16, s7
+; CHECK-NEXT:    mov v2.h[1], w9
+; CHECK-NEXT:    lsl w9, w10, #16
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    fmov s3, w9
+; CHECK-NEXT:    fmov w9, s6
+; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    csetm w8, vs
+; CHECK-NEXT:    mov v2.h[2], w8
+; CHECK-NEXT:    lsl w8, w10, #16
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fcmp s7, s3
+; CHECK-NEXT:    mov h3, v1.h[6]
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    mov h1, v1.h[7]
+; CHECK-NEXT:    fmov s6, w9
+; CHECK-NEXT:    fmov w9, s5
+; CHECK-NEXT:    csetm w8, vs
+; CHECK-NEXT:    mov v2.h[3], w8
+; CHECK-NEXT:    lsl w8, w10, #16
+; CHECK-NEXT:    fcmp s6, s4
+; CHECK-NEXT:    mov h4, v0.h[6]
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s5, w8
+; CHECK-NEXT:    mov h0, v0.h[7]
+; CHECK-NEXT:    fmov s6, w9
+; CHECK-NEXT:    csetm w8, vs
+; CHECK-NEXT:    mov v2.h[4], w8
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    csetm w10, vs
+; CHECK-NEXT:    fmov s3, w8
+; CHECK-NEXT:    fmov s4, w9
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    mov v2.h[5], w10
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    csetm w8, vs
+; CHECK-NEXT:    mov v2.h[6], w8
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    csetm w8, vs
+; CHECK-NEXT:    mov v2.h[7], w8
+; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    ret
+  %1 = fcmp uno <8 x bfloat> %a, %b
+  ret <8 x i1> %1
+}
+
+; fcmp one (ordered and not equal) on <8 x bfloat>, returning an <8 x i1> mask.
+; The expected assembly handles every lane separately: the lane is moved to a
+; W register, shifted left by 16, moved back to an S register and compared
+; with a scalar fcmp. "one" needs two instructions per lane: csetm ..., mi
+; gives all-ones for "less than", and csinv ..., le keeps that value unless
+; the compare was "greater than", in which case it yields all-ones (~wzr).
+; Lane results are inserted into v2 as halfwords and narrowed to bytes by xtn.
+define <8 x i1> @test_fcmp_one(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_one:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov h2, v1.h[1]
+; CHECK-NEXT:    mov h3, v0.h[1]
+; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    fmov w11, s0
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    mov h2, v1.h[2]
+; CHECK-NEXT:    mov h3, v0.h[2]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    lsl w8, w10, #16
+; CHECK-NEXT:    lsl w9, w11, #16
+; CHECK-NEXT:    fmov s6, w8
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    fcmp s5, s4
+; CHECK-NEXT:    fmov s7, w9
+; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    mov h4, v1.h[3]
+; CHECK-NEXT:    mov h5, v0.h[3]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    csetm w10, mi
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    csinv w10, w10, wzr, le
+; CHECK-NEXT:    fcmp s7, s6
+; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    fmov s3, w9
+; CHECK-NEXT:    fmov w11, s4
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov h4, v0.h[4]
+; CHECK-NEXT:    mov h7, v1.h[5]
+; CHECK-NEXT:    csetm w9, mi
+; CHECK-NEXT:    csinv w9, w9, wzr, le
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    mov h3, v1.h[4]
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    lsl w11, w11, #16
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov s5, w11
+; CHECK-NEXT:    fmov s6, w8
+; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    mov v2.h[1], w10
+; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    csinv w8, w8, wzr, le
+; CHECK-NEXT:    mov h3, v1.h[6]
+; CHECK-NEXT:    mov h1, v1.h[7]
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    mov h5, v0.h[5]
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    lsl w10, w10, #16
+; CHECK-NEXT:    mov v2.h[2], w8
+; CHECK-NEXT:    fmov s4, w9
+; CHECK-NEXT:    fmov s6, w10
+; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    fmov w9, s7
+; CHECK-NEXT:    fmov w10, s5
+; CHECK-NEXT:    csinv w8, w8, wzr, le
+; CHECK-NEXT:    fcmp s6, s4
+; CHECK-NEXT:    mov h4, v0.h[6]
+; CHECK-NEXT:    mov v2.h[3], w8
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    lsl w10, w10, #16
+; CHECK-NEXT:    mov h0, v0.h[7]
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    fmov s6, w10
+; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    csinv w8, w8, wzr, le
+; CHECK-NEXT:    mov v2.h[4], w8
+; CHECK-NEXT:    lsl w8, w9, #16
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    lsl w9, w10, #16
+; CHECK-NEXT:    fmov s3, w8
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov s4, w9
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    csetm w10, mi
+; CHECK-NEXT:    csinv w10, w10, wzr, le
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    mov v2.h[5], w10
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    csinv w8, w8, wzr, le
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    mov v2.h[6], w8
+; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    csinv w8, w8, wzr, le
+; CHECK-NEXT:    mov v2.h[7], w8
+; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    ret
+  %1 = fcmp one <8 x bfloat> %a, %b
+  ret <8 x i1> %1
+}
+
+; fcmp oeq (ordered and equal) on <8 x bfloat>, returning an <8 x i1> mask.
+; The expected assembly handles every lane separately: the lane is moved to a
+; W register, shifted left by 16, moved back to an S register and compared
+; with a scalar fcmp; csetm ..., eq turns the flags into an all-ones or zero
+; lane value. Lane results are inserted into v2 as halfwords and narrowed to
+; bytes by xtn.
+define <8 x i1> @test_fcmp_oeq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_oeq:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov h2, v1.h[1]
+; CHECK-NEXT:    mov h3, v0.h[1]
+; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    mov h2, v1.h[2]
+; CHECK-NEXT:    mov h3, v0.h[2]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    lsl w9, w10, #16
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    mov h3, v1.h[4]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fcmp s5, s4
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    mov h4, v1.h[3]
+; CHECK-NEXT:    lsl w10, w10, #16
+; CHECK-NEXT:    fmov s6, w8
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    csetm w9, eq
+; CHECK-NEXT:    fmov s16, w10
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    mov h5, v0.h[3]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    mov h6, v0.h[4]
+; CHECK-NEXT:    mov h4, v1.h[5]
+; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov h5, v0.h[5]
+; CHECK-NEXT:    fcmp s16, s7
+; CHECK-NEXT:    mov v2.h[1], w9
+; CHECK-NEXT:    lsl w9, w10, #16
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    fmov s3, w9
+; CHECK-NEXT:    fmov w9, s6
+; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    mov v2.h[2], w8
+; CHECK-NEXT:    lsl w8, w10, #16
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fcmp s7, s3
+; CHECK-NEXT:    mov h3, v1.h[6]
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    mov h1, v1.h[7]
+; CHECK-NEXT:    fmov s6, w9
+; CHECK-NEXT:    fmov w9, s5
+; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    mov v2.h[3], w8
+; CHECK-NEXT:    lsl w8, w10, #16
+; CHECK-NEXT:    fcmp s6, s4
+; CHECK-NEXT:    mov h4, v0.h[6]
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s5, w8
+; CHECK-NEXT:    mov h0, v0.h[7]
+; CHECK-NEXT:    fmov s6, w9
+; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    mov v2.h[4], w8
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    csetm w10, eq
+; CHECK-NEXT:    fmov s3, w8
+; CHECK-NEXT:    fmov s4, w9
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    mov v2.h[5], w10
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    mov v2.h[6], w8
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    mov v2.h[7], w8
+; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    ret
+  %1 = fcmp oeq <8 x bfloat> %a, %b
+  ret <8 x i1> %1
+}
+
+; fcmp ogt (ordered and greater than) on <8 x bfloat>, returning an <8 x i1>
+; mask. The expected assembly handles every lane separately: the lane is
+; moved to a W register, shifted left by 16, moved back to an S register and
+; compared with a scalar fcmp; csetm ..., gt turns the flags into an all-ones
+; or zero lane value. Lane results are inserted into v2 as halfwords and
+; narrowed to bytes by xtn.
+define <8 x i1> @test_fcmp_ogt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ogt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov h2, v1.h[1]
+; CHECK-NEXT:    mov h3, v0.h[1]
+; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    mov h2, v1.h[2]
+; CHECK-NEXT:    mov h3, v0.h[2]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    lsl w9, w10, #16
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    mov h3, v1.h[4]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fcmp s5, s4
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    mov h4, v1.h[3]
+; CHECK-NEXT:    lsl w10, w10, #16
+; CHECK-NEXT:    fmov s6, w8
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    csetm w9, gt
+; CHECK-NEXT:    fmov s16, w10
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    mov h5, v0.h[3]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    mov h6, v0.h[4]
+; CHECK-NEXT:    mov h4, v1.h[5]
+; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    csetm w8, gt
+; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov h5, v0.h[5]
+; CHECK-NEXT:    fcmp s16, s7
+; CHECK-NEXT:    mov v2.h[1], w9
+; CHECK-NEXT:    lsl w9, w10, #16
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    fmov s3, w9
+; CHECK-NEXT:    fmov w9, s6
+; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    csetm w8, gt
+; CHECK-NEXT:    mov v2.h[2], w8
+; CHECK-NEXT:    lsl w8, w10, #16
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fcmp s7, s3
+; CHECK-NEXT:    mov h3, v1.h[6]
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    mov h1, v1.h[7]
+; CHECK-NEXT:    fmov s6, w9
+; CHECK-NEXT:    fmov w9, s5
+; CHECK-NEXT:    csetm w8, gt
+; CHECK-NEXT:    mov v2.h[3], w8
+; CHECK-NEXT:    lsl w8, w10, #16
+; CHECK-NEXT:    fcmp s6, s4
+; CHECK-NEXT:    mov h4, v0.h[6]
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s5, w8
+; CHECK-NEXT:    mov h0, v0.h[7]
+; CHECK-NEXT:    fmov s6, w9
+; CHECK-NEXT:    csetm w8, gt
+; CHECK-NEXT:    mov v2.h[4], w8
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    csetm w10, gt
+; CHECK-NEXT:    fmov s3, w8
+; CHECK-NEXT:    fmov s4, w9
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    mov v2.h[5], w10
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    csetm w8, gt
+; CHECK-NEXT:    mov v2.h[6], w8
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    csetm w8, gt
+; CHECK-NEXT:    mov v2.h[7], w8
+; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    ret
+  %1 = fcmp ogt <8 x bfloat> %a, %b
+  ret <8 x i1> %1
+}
+
+; fcmp oge (ordered and greater than or equal) on <8 x bfloat>, returning an
+; <8 x i1> mask. The expected assembly handles every lane separately: the
+; lane is moved to a W register, shifted left by 16, moved back to an S
+; register and compared with a scalar fcmp; csetm ..., ge turns the flags
+; into an all-ones or zero lane value. Lane results are inserted into v2 as
+; halfwords and narrowed to bytes by xtn.
+define <8 x i1> @test_fcmp_oge(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_oge:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov h2, v1.h[1]
+; CHECK-NEXT:    mov h3, v0.h[1]
+; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    mov h2, v1.h[2]
+; CHECK-NEXT:    mov h3, v0.h[2]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    lsl w9, w10, #16
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    mov h3, v1.h[4]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fcmp s5, s4
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    mov h4, v1.h[3]
+; CHECK-NEXT:    lsl w10, w10, #16
+; CHECK-NEXT:    fmov s6, w8
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    csetm w9, ge
+; CHECK-NEXT:    fmov s16, w10
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    mov h5, v0.h[3]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    mov h6, v0.h[4]
+; CHECK-NEXT:    mov h4, v1.h[5]
+; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    csetm w8, ge
+; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov h5, v0.h[5]
+; CHECK-NEXT:    fcmp s16, s7
+; CHECK-NEXT:    mov v2.h[1], w9
+; CHECK-NEXT:    lsl w9, w10, #16
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    fmov s3, w9
+; CHECK-NEXT:    fmov w9, s6
+; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    csetm w8, ge
+; CHECK-NEXT:    mov v2.h[2], w8
+; CHECK-NEXT:    lsl w8, w10, #16
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fcmp s7, s3
+; CHECK-NEXT:    mov h3, v1.h[6]
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    mov h1, v1.h[7]
+; CHECK-NEXT:    fmov s6, w9
+; CHECK-NEXT:    fmov w9, s5
+; CHECK-NEXT:    csetm w8, ge
+; CHECK-NEXT:    mov v2.h[3], w8
+; CHECK-NEXT:    lsl w8, w10, #16
+; CHECK-NEXT:    fcmp s6, s4
+; CHECK-NEXT:    mov h4, v0.h[6]
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s5, w8
+; CHECK-NEXT:    mov h0, v0.h[7]
+; CHECK-NEXT:    fmov s6, w9
+; CHECK-NEXT:    csetm w8, ge
+; CHECK-NEXT:    mov v2.h[4], w8
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    csetm w10, ge
+; CHECK-NEXT:    fmov s3, w8
+; CHECK-NEXT:    fmov s4, w9
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    mov v2.h[5], w10
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    csetm w8, ge
+; CHECK-NEXT:    mov v2.h[6], w8
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    csetm w8, ge
+; CHECK-NEXT:    mov v2.h[7], w8
+; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    ret
+  %1 = fcmp oge <8 x bfloat> %a, %b
+  ret <8 x i1> %1
+}
+
+; fcmp olt (ordered and less than) on <8 x bfloat>, returning an <8 x i1>
+; mask. The expected assembly handles every lane separately: the lane is
+; moved to a W register, shifted left by 16, moved back to an S register and
+; compared with a scalar fcmp; csetm ..., mi (N flag set) turns the flags
+; into an all-ones or zero lane value. Lane results are inserted into v2 as
+; halfwords and narrowed to bytes by xtn.
+define <8 x i1> @test_fcmp_olt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_olt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov h2, v1.h[1]
+; CHECK-NEXT:    mov h3, v0.h[1]
+; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    mov h2, v1.h[2]
+; CHECK-NEXT:    mov h3, v0.h[2]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    lsl w9, w10, #16
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    mov h3, v1.h[4]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fcmp s5, s4
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    mov h4, v1.h[3]
+; CHECK-NEXT:    lsl w10, w10, #16
+; CHECK-NEXT:    fmov s6, w8
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    csetm w9, mi
+; CHECK-NEXT:    fmov s16, w10
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    mov h5, v0.h[3]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    mov h6, v0.h[4]
+; CHECK-NEXT:    mov h4, v1.h[5]
+; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov h5, v0.h[5]
+; CHECK-NEXT:    fcmp s16, s7
+; CHECK-NEXT:    mov v2.h[1], w9
+; CHECK-NEXT:    lsl w9, w10, #16
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    fmov s3, w9
+; CHECK-NEXT:    fmov w9, s6
+; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    mov v2.h[2], w8
+; CHECK-NEXT:    lsl w8, w10, #16
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fcmp s7, s3
+; CHECK-NEXT:    mov h3, v1.h[6]
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    mov h1, v1.h[7]
+; CHECK-NEXT:    fmov s6, w9
+; CHECK-NEXT:    fmov w9, s5
+; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    mov v2.h[3], w8
+; CHECK-NEXT:    lsl w8, w10, #16
+; CHECK-NEXT:    fcmp s6, s4
+; CHECK-NEXT:    mov h4, v0.h[6]
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s5, w8
+; CHECK-NEXT:    mov h0, v0.h[7]
+; CHECK-NEXT:    fmov s6, w9
+; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    mov v2.h[4], w8
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    csetm w10, mi
+; CHECK-NEXT:    fmov s3, w8
+; CHECK-NEXT:    fmov s4, w9
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    mov v2.h[5], w10
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    mov v2.h[6], w8
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    mov v2.h[7], w8
+; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    ret
+  %1 = fcmp olt <8 x bfloat> %a, %b
+  ret <8 x i1> %1
+}
+
+; fcmp ole (ordered and less than or equal) on <8 x bfloat>, returning an
+; <8 x i1> mask. The expected assembly handles every lane separately: the
+; lane is moved to a W register, shifted left by 16, moved back to an S
+; register and compared with a scalar fcmp; csetm ..., ls turns the flags
+; into an all-ones or zero lane value. Lane results are inserted into v2 as
+; halfwords and narrowed to bytes by xtn.
+define <8 x i1> @test_fcmp_ole(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ole:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov h2, v1.h[1]
+; CHECK-NEXT:    mov h3, v0.h[1]
+; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    mov h2, v1.h[2]
+; CHECK-NEXT:    mov h3, v0.h[2]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    lsl w9, w10, #16
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    mov h3, v1.h[4]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fcmp s5, s4
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    mov h4, v1.h[3]
+; CHECK-NEXT:    lsl w10, w10, #16
+; CHECK-NEXT:    fmov s6, w8
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    csetm w9, ls
+; CHECK-NEXT:    fmov s16, w10
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    mov h5, v0.h[3]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    mov h6, v0.h[4]
+; CHECK-NEXT:    mov h4, v1.h[5]
+; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    csetm w8, ls
+; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov h5, v0.h[5]
+; CHECK-NEXT:    fcmp s16, s7
+; CHECK-NEXT:    mov v2.h[1], w9
+; CHECK-NEXT:    lsl w9, w10, #16
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    fmov s3, w9
+; CHECK-NEXT:    fmov w9, s6
+; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    csetm w8, ls
+; CHECK-NEXT:    mov v2.h[2], w8
+; CHECK-NEXT:    lsl w8, w10, #16
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fcmp s7, s3
+; CHECK-NEXT:    mov h3, v1.h[6]
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    mov h1, v1.h[7]
+; CHECK-NEXT:    fmov s6, w9
+; CHECK-NEXT:    fmov w9, s5
+; CHECK-NEXT:    csetm w8, ls
+; CHECK-NEXT:    mov v2.h[3], w8
+; CHECK-NEXT:    lsl w8, w10, #16
+; CHECK-NEXT:    fcmp s6, s4
+; CHECK-NEXT:    mov h4, v0.h[6]
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s5, w8
+; CHECK-NEXT:    mov h0, v0.h[7]
+; CHECK-NEXT:    fmov s6, w9
+; CHECK-NEXT:    csetm w8, ls
+; CHECK-NEXT:    mov v2.h[4], w8
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    csetm w10, ls
+; CHECK-NEXT:    fmov s3, w8
+; CHECK-NEXT:    fmov s4, w9
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    mov v2.h[5], w10
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    csetm w8, ls
+; CHECK-NEXT:    mov v2.h[6], w8
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    csetm w8, ls
+; CHECK-NEXT:    mov v2.h[7], w8
+; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    ret
+  %1 = fcmp ole <8 x bfloat> %a, %b
+  ret <8 x i1> %1
+}
+
+; fcmp ord (neither operand is a NaN) on <8 x bfloat>, returning an <8 x i1>
+; mask. The expected assembly handles every lane separately: the lane is
+; moved to a W register, shifted left by 16, moved back to an S register and
+; compared with a scalar fcmp; csetm ..., vc (V flag clear) turns the flags
+; into an all-ones or zero lane value. Lane results are inserted into v2 as
+; halfwords and narrowed to bytes by xtn.
+define <8 x i1> @test_fcmp_ord(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ord:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov h2, v1.h[1]
+; CHECK-NEXT:    mov h3, v0.h[1]
+; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    mov h2, v1.h[2]
+; CHECK-NEXT:    mov h3, v0.h[2]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    lsl w9, w10, #16
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    mov h3, v1.h[4]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fcmp s5, s4
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    mov h4, v1.h[3]
+; CHECK-NEXT:    lsl w10, w10, #16
+; CHECK-NEXT:    fmov s6, w8
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    csetm w9, vc
+; CHECK-NEXT:    fmov s16, w10
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    mov h5, v0.h[3]
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    mov h6, v0.h[4]
+; CHECK-NEXT:    mov h4, v1.h[5]
+; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    csetm w8, vc
+; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov h5, v0.h[5]
+; CHECK-NEXT:    fcmp s16, s7
+; CHECK-NEXT:    mov v2.h[1], w9
+; CHECK-NEXT:    lsl w9, w10, #16
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    fmov s3, w9
+; CHECK-NEXT:    fmov w9, s6
+; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    csetm w8, vc
+; CHECK-NEXT:    mov v2.h[2], w8
+; CHECK-NEXT:    lsl w8, w10, #16
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fcmp s7, s3
+; CHECK-NEXT:    mov h3, v1.h[6]
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    mov h1, v1.h[7]
+; CHECK-NEXT:    fmov s6, w9
+; CHECK-NEXT:    fmov w9, s5
+; CHECK-NEXT:    csetm w8, vc
+; CHECK-NEXT:    mov v2.h[3], w8
+; CHECK-NEXT:    lsl w8, w10, #16
+; CHECK-NEXT:    fcmp s6, s4
+; CHECK-NEXT:    mov h4, v0.h[6]
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s5, w8
+; CHECK-NEXT:    mov h0, v0.h[7]
+; CHECK-NEXT:    fmov s6, w9
+; CHECK-NEXT:    csetm w8, vc
+; CHECK-NEXT:    mov v2.h[4], w8
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    fcmp s6, s5
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    csetm w10, vc
+; CHECK-NEXT:    fmov s3, w8
+; CHECK-NEXT:    fmov s4, w9
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    mov v2.h[5], w10
+; CHECK-NEXT:    lsl w8, w8, #16
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    lsl w9, w9, #16
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    csetm w8, vc
+; CHECK-NEXT:    mov v2.h[6], w8
+; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    csetm w8, vc
+; CHECK-NEXT:    mov v2.h[7], w8
+; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    ret
+  %1 = fcmp ord <8 x bfloat> %a, %b
+  ret <8 x i1> %1
+}
+
+; Attribute group #0 is referenced by the test functions above; nounwind
+; declares that they never unwind.
+attributes #0 = { nounwind }


        


More information about the llvm-commits mailing list