[llvm] 76db473 - [AArch64] Add bf16 instruction coverage. NFC
David Green via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 6 03:38:07 PST 2024
Author: David Green
Date: 2024-12-06T11:38:02Z
New Revision: 76db47335903cb65d3027c0a77658f488d8ce659
URL: https://github.com/llvm/llvm-project/commit/76db47335903cb65d3027c0a77658f488d8ce659
DIFF: https://github.com/llvm/llvm-project/commit/76db47335903cb65d3027c0a77658f488d8ce659.diff
LOG: [AArch64] Add bf16 instruction coverage. NFC
These are the same tests as fp16-instructions.ll, fp16-v4-instructions.ll and
fp16-v8-instructions.ll ported to bf16.
Added:
llvm/test/CodeGen/AArch64/bf16-instructions.ll
llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll
llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
Modified:
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/AArch64/bf16-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-instructions.ll
new file mode 100644
index 00000000000000..33997614598c3a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/bf16-instructions.ll
@@ -0,0 +1,2347 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple aarch64-unknown-unknown | FileCheck %s --check-prefixes=CHECK,CHECK-CVT
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -mattr=+bf16 | FileCheck %s --check-prefixes=CHECK,CHECK-BF16
+
+define bfloat @test_fadd(bfloat %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_fadd:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s1
+; CHECK-CVT-NEXT: fmov w10, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: lsl w10, w10, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: fmov s1, w10
+; CHECK-CVT-NEXT: fadd s0, s1, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_fadd:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s1
+; CHECK-BF16-NEXT: fmov w9, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: fadd s0, s1, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = fadd bfloat %a, %b ; scalar bf16 add: both runs widen the operands to f32 (lsl #16) and add with fadd s; only the narrowing differs (integer sequence adding 0x7fff + bit 16 then lsr #16, versus a single bfcvt with +bf16)
+ ret bfloat %r
+}
+
+define bfloat @test_fsub(bfloat %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_fsub:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s1
+; CHECK-CVT-NEXT: fmov w10, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: lsl w10, w10, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: fmov s1, w10
+; CHECK-CVT-NEXT: fsub s0, s1, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_fsub:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s1
+; CHECK-BF16-NEXT: fmov w9, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: fsub s0, s1, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = fsub bfloat %a, %b ; scalar bf16 subtract: operands are widened to f32 (lsl #16), subtracted with fsub s, then narrowed (integer 0x7fff + bit 16 sequence without +bf16, bfcvt with it)
+ ret bfloat %r
+}
+
+define bfloat @test_fmul(bfloat %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_fmul:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s1
+; CHECK-CVT-NEXT: fmov w10, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: lsl w10, w10, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: fmov s1, w10
+; CHECK-CVT-NEXT: fmul s0, s1, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_fmul:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s1
+; CHECK-BF16-NEXT: fmov w9, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: fmul s0, s1, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = fmul bfloat %a, %b ; scalar bf16 multiply: operands are widened to f32 (lsl #16), multiplied with fmul s, then narrowed (integer 0x7fff + bit 16 sequence without +bf16, bfcvt with it)
+ ret bfloat %r
+}
+
+define bfloat @test_fmadd(bfloat %a, bfloat %b, bfloat %c) #0 {
+; CHECK-CVT-LABEL: test_fmadd:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s1
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w10, #32767 // =0x7fff
+; CHECK-CVT-NEXT: // kill: def $h2 killed $h2 def $s2
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: fmov s1, w9
+; CHECK-CVT-NEXT: fmul s0, s1, s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: ubfx w9, w8, #16, #1
+; CHECK-CVT-NEXT: add w8, w8, w10
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: fmov w9, s2
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: fmov s1, w9
+; CHECK-CVT-NEXT: fadd s0, s0, s1
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: ubfx w9, w8, #16, #1
+; CHECK-CVT-NEXT: add w8, w8, w10
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_fmadd:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s1
+; CHECK-BF16-NEXT: fmov w9, s0
+; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $s2
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: fmov w9, s2
+; CHECK-BF16-NEXT: fmul s0, s1, s0
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fadd s0, s0, s1
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %mul = fmul fast bfloat %a, %b ; fast-math multiply feeding the add below; the product is narrowed to bf16 and widened again before the add in both runs
+ %r = fadd fast bfloat %mul, %c ; expected output keeps a separate fmul s and fadd s (no fused multiply-add instruction appears in either run)
+ ret bfloat %r
+}
+
+define bfloat @test_fdiv(bfloat %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_fdiv:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s1
+; CHECK-CVT-NEXT: fmov w10, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: lsl w10, w10, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: fmov s1, w10
+; CHECK-CVT-NEXT: fdiv s0, s1, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_fdiv:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s1
+; CHECK-BF16-NEXT: fmov w9, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: fdiv s0, s1, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = fdiv bfloat %a, %b ; scalar bf16 divide: operands are widened to f32 (lsl #16), divided with fdiv s, then narrowed (integer 0x7fff + bit 16 sequence without +bf16, bfcvt with it)
+ ret bfloat %r
+}
+
+define bfloat @test_frem(bfloat %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_frem:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: fmov w9, s1
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: fmov s1, w9
+; CHECK-CVT-NEXT: bl fmodf
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_frem:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: fmov w9, s1
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: bl fmodf
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = frem bfloat %a, %b ; bf16 remainder: expected code widens both operands to f32 and calls fmodf (x30 is spilled around the bl), then narrows the f32 result back to bf16
+ ret bfloat %r
+}
+
+define void @test_store(bfloat %a, ptr %b) #0 {
+; CHECK-LABEL: test_store:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+ store bfloat %a, ptr %b ; plain bf16 store: a single 16-bit str h0 in both runs, no conversion
+ ret void
+}
+
+define bfloat @test_load(ptr %a) #0 {
+; CHECK-LABEL: test_load:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: ret
+ %r = load bfloat, ptr %a ; plain bf16 load: a single 16-bit ldr h0 in both runs, no conversion
+ ret bfloat %r
+}
+
+declare bfloat @test_callee(bfloat %a, bfloat %b) #0 ; external function taking and returning bfloat; target of the call tests in this file
+
+define bfloat @test_call(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_call:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: bl test_callee
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %r = call bfloat @test_callee(bfloat %a, bfloat %b) ; arguments are forwarded in the same order, so the expected code has no register moves: only the x30 spill/reload around the bl
+ ret bfloat %r
+}
+
+define bfloat @test_call_flipped(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_call_flipped:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: fmov s2, s0
+; CHECK-NEXT: fmov s0, s1
+; CHECK-NEXT: fmov s1, s2
+; CHECK-NEXT: bl test_callee
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %r = call bfloat @test_callee(bfloat %b, bfloat %a) ; arguments are passed in swapped order: expected code exchanges s0 and s1 through s2 before the bl
+ ret bfloat %r
+}
+
+define bfloat @test_tailcall_flipped(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_tailcall_flipped:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov s2, s0
+; CHECK-NEXT: fmov s0, s1
+; CHECK-NEXT: fmov s1, s2
+; CHECK-NEXT: b test_callee
+ %r = tail call bfloat @test_callee(bfloat %b, bfloat %a) ; swapped arguments with a tail call: same s0/s1 exchange through s2, then a plain branch (b) with no x30 spill and no ret
+ ret bfloat %r
+}
+
+define bfloat @test_select(bfloat %a, bfloat %b, i1 zeroext %c) #0 {
+; CHECK-LABEL: test_select:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: fcsel s0, s0, s1, ne
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+ %r = select i1 %c, bfloat %a, bfloat %b ; select on an integer condition: cmp w0, #0 followed by fcsel on the s-register views of the bf16 values; no widening shifts are emitted
+ ret bfloat %r
+}
+
+define bfloat @test_select_cc(bfloat %a, bfloat %b, bfloat %c, bfloat %d) #0 {
+; CHECK-LABEL: test_select_cc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h3 killed $h3 def $s3
+; CHECK-NEXT: // kill: def $h2 killed $h2 def $s2
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fcmp s3, s2
+; CHECK-NEXT: fcsel s0, s0, s1, ne
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+ %cc = fcmp une bfloat %c, %d ; bf16 compare: only the compared operands (%c, %d) are widened to f32 (lsl #16) before fcmp
+ %r = select i1 %cc, bfloat %a, bfloat %b ; the selected values stay unconverted; fcsel ne works on their s-register views
+ ret bfloat %r
+}
+
+define float @test_select_cc_f32_f16(float %a, float %b, bfloat %c, bfloat %d) #0 {
+; CHECK-LABEL: test_select_cc_f32_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h3 killed $h3 def $s3
+; CHECK-NEXT: // kill: def $h2 killed $h2 def $s2
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fcmp s3, s2
+; CHECK-NEXT: fcsel s0, s0, s1, ne
+; CHECK-NEXT: ret
+ %cc = fcmp une bfloat %c, %d ; compare on bfloat operands, widened to f32 (lsl #16) before fcmp. NOTE(review): the "f16" in the function name looks inherited from the fp16 test this file was ported from; the operands here are bfloat
+ %r = select i1 %cc, float %a, float %b ; f32 values are selected directly with fcsel ne
+ ret float %r
+}
+
+define bfloat @test_select_cc_f16_f32(bfloat %a, bfloat %b, float %c, float %d) #0 {
+; CHECK-LABEL: test_select_cc_f16_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcmp s2, s3
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: fcsel s0, s0, s1, ne
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+ %cc = fcmp une float %c, %d ; compare is already f32, so fcmp s2, s3 needs no widening
+ %r = select i1 %cc, bfloat %a, bfloat %b ; bfloat values selected via fcsel ne on their s-register views. NOTE(review): the "f16" in the function name looks inherited from the fp16 test this file was ported from; the selected type here is bfloat
+ ret bfloat %r
+}
+
+define i1 @test_fcmp_une(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_une:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w0, ne
+; CHECK-NEXT: ret
+ %r = fcmp une bfloat %a, %b ; une predicate: operands widened to f32 (lsl #16), one fcmp, result materialized with cset ne
+ ret i1 %r
+}
+
+define i1 @test_fcmp_ueq(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_ueq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w8, eq
+; CHECK-NEXT: csinc w0, w8, wzr, vc
+; CHECK-NEXT: ret
+ %r = fcmp ueq bfloat %a, %b ; ueq predicate: operands widened to f32, one fcmp, then two condition tests combined (cset eq, csinc ... vc)
+ ret i1 %r
+}
+
+define i1 @test_fcmp_ugt(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_ugt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w0, hi
+; CHECK-NEXT: ret
+ %r = fcmp ugt bfloat %a, %b ; ugt predicate: operands widened to f32 (lsl #16), one fcmp, result materialized with cset hi
+ ret i1 %r
+}
+
+define i1 @test_fcmp_uge(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_uge:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w0, pl
+; CHECK-NEXT: ret
+ %r = fcmp uge bfloat %a, %b ; uge predicate: operands widened to f32 (lsl #16), one fcmp, result materialized with cset pl
+ ret i1 %r
+}
+
+define i1 @test_fcmp_ult(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_ult:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w0, lt
+; CHECK-NEXT: ret
+ %r = fcmp ult bfloat %a, %b ; ult predicate: operands widened to f32 (lsl #16), one fcmp, result materialized with cset lt
+ ret i1 %r
+}
+
+define i1 @test_fcmp_ule(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_ule:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w0, le
+; CHECK-NEXT: ret
+ %r = fcmp ule bfloat %a, %b ; ule predicate: operands widened to f32 (lsl #16), one fcmp, result materialized with cset le
+ ret i1 %r
+}
+
+define i1 @test_fcmp_uno(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_uno:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w0, vs
+; CHECK-NEXT: ret
+ %r = fcmp uno bfloat %a, %b ; uno predicate: operands widened to f32 (lsl #16), one fcmp, result materialized with cset vs
+ ret i1 %r
+}
+
+define i1 @test_fcmp_one(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w8, mi
+; CHECK-NEXT: csinc w0, w8, wzr, le
+; CHECK-NEXT: ret
+ %r = fcmp one bfloat %a, %b ; one predicate: operands widened to f32, one fcmp, then two condition tests combined (cset mi, csinc ... le)
+ ret i1 %r
+}
+
+define i1 @test_fcmp_oeq(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_oeq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %r = fcmp oeq bfloat %a, %b ; oeq predicate: operands widened to f32 (lsl #16), one fcmp, result materialized with cset eq
+ ret i1 %r
+}
+
+define i1 @test_fcmp_ogt(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_ogt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w0, gt
+; CHECK-NEXT: ret
+ %r = fcmp ogt bfloat %a, %b ; ogt predicate: operands widened to f32 (lsl #16), one fcmp, result materialized with cset gt
+ ret i1 %r
+}
+
+define i1 @test_fcmp_oge(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_oge:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w0, ge
+; CHECK-NEXT: ret
+ %r = fcmp oge bfloat %a, %b ; oge predicate: operands widened to f32 (lsl #16), one fcmp, result materialized with cset ge
+ ret i1 %r
+}
+
+define i1 @test_fcmp_olt(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_olt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w0, mi
+; CHECK-NEXT: ret
+ %r = fcmp olt bfloat %a, %b ; olt predicate: operands widened to f32 (lsl #16), one fcmp, result materialized with cset mi
+ ret i1 %r
+}
+
+define i1 @test_fcmp_ole(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_ole:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w0, ls
+; CHECK-NEXT: ret
+ %r = fcmp ole bfloat %a, %b ; ole predicate: operands widened to f32 (lsl #16), one fcmp, result materialized with cset ls
+ ret i1 %r
+}
+
+define i1 @test_fcmp_ord(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_ord:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w0, vc
+; CHECK-NEXT: ret
+ %r = fcmp ord bfloat %a, %b ; ord predicate: operands widened to f32 (lsl #16), one fcmp, result materialized with cset vc
+ ret i1 %r
+}
+
+define void @test_fccmp(bfloat %in, ptr %out) {
+; CHECK-LABEL: test_fccmp:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: movi v1.2s, #69, lsl #24
+; CHECK-NEXT: movi v3.2s, #72, lsl #24
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: adrp x8, .LCPI29_0
+; CHECK-NEXT: fcmp s2, s1
+; CHECK-NEXT: ldr h1, [x8, :lo12:.LCPI29_0]
+; CHECK-NEXT: fccmp s2, s3, #4, mi
+; CHECK-NEXT: fcsel s0, s0, s1, gt
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+ %cmp1 = fcmp ogt bfloat %in, 0xR4800 ; first compare against a bf16 constant; materialized as an f32 immediate (movi #72, lsl #24). NOTE(review): unlike the neighbouring tests, this define carries no #0 attribute group
+ %cmp2 = fcmp olt bfloat %in, 0xR4500 ; second compare; constant materialized as movi #69, lsl #24
+ %cond = and i1 %cmp1, %cmp2 ; the AND of the two compares is expected to become fcmp followed by a conditional compare (fccmp)
+ %result = select i1 %cond, bfloat %in, bfloat 0xR4500 ; the bf16 constant operand of the select is loaded from a constant pool entry (.LCPI29_0) and chosen with fcsel gt
+ store bfloat %result, ptr %out
+ ret void
+}
+
+define void @test_br_cc(bfloat %a, bfloat %b, ptr %p1, ptr %p2) #0 {
+; CHECK-LABEL: test_br_cc:
+; CHECK: // %bb.0: // %common.ret
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: csel x8, x0, x1, pl
+; CHECK-NEXT: str wzr, [x8]
+; CHECK-NEXT: ret
+ %c = fcmp uge bfloat %a, %b ; bf16 compare feeding a conditional branch; operands are widened to f32 before fcmp
+ br i1 %c, label %then, label %else ; both successors store i32 0 and return, so the expected code has no branch: csel (pl) picks %p1 or %p2, then one str wzr
+then:
+ store i32 0, ptr %p1
+ ret void
+else:
+ store i32 0, ptr %p2
+ ret void
+}
+
+define bfloat @test_phi(ptr %p1) #0 {
+; CHECK-LABEL: test_phi:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: ldr h9, [x0]
+; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: .LBB31_1: // %loop
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: fmov s8, s9
+; CHECK-NEXT: ldr h9, [x19]
+; CHECK-NEXT: mov x0, x19
+; CHECK-NEXT: bl test_dummy
+; CHECK-NEXT: tbnz w0, #0, .LBB31_1
+; CHECK-NEXT: // %bb.2: // %return
+; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: fmov s0, s8
+; CHECK-NEXT: ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %a = load bfloat, ptr %p1 ; initial value for the phi, loaded before the loop (ldr h9)
+ br label %loop
+loop:
+ %r = phi bfloat [%a, %entry], [%b, %loop] ; bf16 phi: holds the value loaded on the previous iteration; expected code keeps it in s8 (copied from s9), with d8/d9 spilled in the prologue because they are live across the call
+ %b = load bfloat, ptr %p1 ; reloaded every iteration into h9; becomes %r on the next trip
+ %c = call i1 @test_dummy(ptr %p1) ; loop continues while bit 0 of the call result is set (tbnz w0, #0)
+ br i1 %c, label %loop, label %return
+return:
+ ret bfloat %r
+}
+
+declare i1 @test_dummy(ptr %p1) #0 ; external function returning i1; supplies the loop condition in test_phi
+
+define i32 @test_fptosi_i32(bfloat %a) #0 {
+; CHECK-LABEL: test_fptosi_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fcvtzs w0, s0
+; CHECK-NEXT: ret
+ %r = fptosi bfloat %a to i32 ; bf16 -> signed i32: input widened to f32 (lsl #16), then fcvtzs into a w register; identical in both runs
+ ret i32 %r
+}
+
+define i64 @test_fptosi_i64(bfloat %a) #0 {
+; CHECK-LABEL: test_fptosi_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fcvtzs x0, s0
+; CHECK-NEXT: ret
+ %r = fptosi bfloat %a to i64 ; bf16 -> signed i64: input widened to f32 (lsl #16), then fcvtzs into an x register; identical in both runs
+ ret i64 %r
+}
+
+define i32 @test_fptoui_i32(bfloat %a) #0 {
+; CHECK-LABEL: test_fptoui_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fcvtzu w0, s0
+; CHECK-NEXT: ret
+ %r = fptoui bfloat %a to i32 ; bf16 -> unsigned i32: input widened to f32 (lsl #16), then fcvtzu into a w register; identical in both runs
+ ret i32 %r
+}
+
+define i64 @test_fptoui_i64(bfloat %a) #0 {
+; CHECK-LABEL: test_fptoui_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fcvtzu x0, s0
+; CHECK-NEXT: ret
+ %r = fptoui bfloat %a to i64 ; bf16 -> unsigned i64: input widened to f32 (lsl #16), then fcvtzu into an x register; identical in both runs
+ ret i64 %r
+}
+
+define bfloat @test_uitofp_i32(i32 %a) #0 {
+; CHECK-CVT-LABEL: test_uitofp_i32:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: ucvtf d0, w0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: fcvtxn s0, d0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_uitofp_i32:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: ucvtf d0, w0
+; CHECK-BF16-NEXT: fcvtxn s0, d0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = uitofp i32 %a to bfloat ; unsigned i32 -> bf16: converted to double first (ucvtf d0, w0), narrowed to f32 with fcvtxn, then to bf16 (integer 0x7fff + bit 16 sequence without +bf16, bfcvt with it)
+ ret bfloat %r
+}
+
+define bfloat @test_uitofp_i64(i64 %a) #0 {
+; CHECK-CVT-LABEL: test_uitofp_i64:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: lsr x9, x0, #53
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: cmp x9, #0
+; CHECK-CVT-NEXT: and x9, x0, #0xfffffffffffff000
+; CHECK-CVT-NEXT: csel x9, x9, x0, ne
+; CHECK-CVT-NEXT: ucvtf d0, x9
+; CHECK-CVT-NEXT: cset w9, ne
+; CHECK-CVT-NEXT: tst x0, #0xfff
+; CHECK-CVT-NEXT: csel w9, wzr, w9, eq
+; CHECK-CVT-NEXT: fmov x10, d0
+; CHECK-CVT-NEXT: orr x9, x10, x9
+; CHECK-CVT-NEXT: fmov d0, x9
+; CHECK-CVT-NEXT: fcvtxn s0, d0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_uitofp_i64:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: lsr x8, x0, #53
+; CHECK-BF16-NEXT: and x9, x0, #0xfffffffffffff000
+; CHECK-BF16-NEXT: cmp x8, #0
+; CHECK-BF16-NEXT: csel x8, x9, x0, ne
+; CHECK-BF16-NEXT: ucvtf d0, x8
+; CHECK-BF16-NEXT: cset w8, ne
+; CHECK-BF16-NEXT: tst x0, #0xfff
+; CHECK-BF16-NEXT: csel w8, wzr, w8, eq
+; CHECK-BF16-NEXT: fmov x9, d0
+; CHECK-BF16-NEXT: orr x8, x9, x8
+; CHECK-BF16-NEXT: fmov d0, x8
+; CHECK-BF16-NEXT: fcvtxn s0, d0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = uitofp i64 %a to bfloat ; unsigned i64 -> bf16 via double: when any bit at or above bit 53 is set, the low 12 bits are masked off before ucvtf and, if any of them were nonzero, a 1 is ORed into the double's bit pattern; then fcvtxn to f32 and the usual bf16 narrowing
+ ret bfloat %r
+}
+
+define bfloat @test_sitofp_i32(i32 %a) #0 {
+; CHECK-CVT-LABEL: test_sitofp_i32:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: scvtf d0, w0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: fcvtxn s0, d0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_sitofp_i32:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: scvtf d0, w0
+; CHECK-BF16-NEXT: fcvtxn s0, d0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = sitofp i32 %a to bfloat ; signed i32 -> bf16: converted to double first (scvtf d0, w0), narrowed to f32 with fcvtxn, then to bf16 (integer 0x7fff + bit 16 sequence without +bf16, bfcvt with it)
+ ret bfloat %r
+}
+
+define bfloat @test_sitofp_i64(i64 %a) #0 {
+; CHECK-CVT-LABEL: test_sitofp_i64:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: cmp x0, #0
+; CHECK-CVT-NEXT: and x11, x0, #0x8000000000000000
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: cneg x9, x0, mi
+; CHECK-CVT-NEXT: lsr x10, x9, #53
+; CHECK-CVT-NEXT: cmp x10, #0
+; CHECK-CVT-NEXT: and x10, x9, #0xfffffffffffff000
+; CHECK-CVT-NEXT: csel x10, x10, x9, ne
+; CHECK-CVT-NEXT: scvtf d0, x10
+; CHECK-CVT-NEXT: cset w10, ne
+; CHECK-CVT-NEXT: tst x9, #0xfff
+; CHECK-CVT-NEXT: csel w10, wzr, w10, eq
+; CHECK-CVT-NEXT: fmov x9, d0
+; CHECK-CVT-NEXT: orr x9, x9, x11
+; CHECK-CVT-NEXT: orr x9, x9, x10
+; CHECK-CVT-NEXT: fmov d0, x9
+; CHECK-CVT-NEXT: fcvtxn s0, d0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_sitofp_i64:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: cmp x0, #0
+; CHECK-BF16-NEXT: cneg x8, x0, mi
+; CHECK-BF16-NEXT: lsr x9, x8, #53
+; CHECK-BF16-NEXT: and x10, x8, #0xfffffffffffff000
+; CHECK-BF16-NEXT: cmp x9, #0
+; CHECK-BF16-NEXT: csel x9, x10, x8, ne
+; CHECK-BF16-NEXT: and x10, x0, #0x8000000000000000
+; CHECK-BF16-NEXT: cset w11, ne
+; CHECK-BF16-NEXT: scvtf d0, x9
+; CHECK-BF16-NEXT: tst x8, #0xfff
+; CHECK-BF16-NEXT: fmov x9, d0
+; CHECK-BF16-NEXT: orr x8, x9, x10
+; CHECK-BF16-NEXT: csel w9, wzr, w11, eq
+; CHECK-BF16-NEXT: orr x8, x8, x9
+; CHECK-BF16-NEXT: fmov d0, x8
+; CHECK-BF16-NEXT: fcvtxn s0, d0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = sitofp i64 %a to bfloat ; signed i64 -> bf16 via double: the input is negated when negative (cneg ... mi) and gets the same masking of the low 12 bits as the unsigned i64 case; the original sign bit and the extra low bit are ORed into the double's bit pattern before fcvtxn and the bf16 narrowing
+ ret bfloat %r
+}
+
+define bfloat @test_uitofp_i32_fadd(i32 %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_uitofp_i32_fadd:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: ucvtf d1, w0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fcvtxn s1, d1
+; CHECK-CVT-NEXT: fmov w9, s1
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w9, w9, w8
+; CHECK-CVT-NEXT: add w9, w10, w9
+; CHECK-CVT-NEXT: lsr w9, w9, #16
+; CHECK-CVT-NEXT: fmov s1, w9
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: fmov w10, s1
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: lsl w10, w10, #16
+; CHECK-CVT-NEXT: fmov s1, w10
+; CHECK-CVT-NEXT: fadd s0, s0, s1
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_uitofp_i32_fadd:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: ucvtf d1, w0
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fcvtxn s1, d1
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bfcvt h1, s1
+; CHECK-BF16-NEXT: fmov w9, s1
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: fadd s0, s0, s1
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %c = uitofp i32 %a to bfloat ; unsigned i32 -> bf16 (ucvtf to double, fcvtxn to f32, narrow to bf16)
+ %r = fadd bfloat %b, %c ; the converted value is narrowed to bf16 and then widened again (lsl #16) for the f32 fadd; the intermediate narrowing is kept in both runs
+ ret bfloat %r
+}
+
+define bfloat @test_sitofp_i32_fadd(i32 %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_sitofp_i32_fadd:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: scvtf d1, w0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fcvtxn s1, d1
+; CHECK-CVT-NEXT: fmov w9, s1
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w9, w9, w8
+; CHECK-CVT-NEXT: add w9, w10, w9
+; CHECK-CVT-NEXT: lsr w9, w9, #16
+; CHECK-CVT-NEXT: fmov s1, w9
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: fmov w10, s1
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: lsl w10, w10, #16
+; CHECK-CVT-NEXT: fmov s1, w10
+; CHECK-CVT-NEXT: fadd s0, s0, s1
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_sitofp_i32_fadd:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: scvtf d1, w0
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fcvtxn s1, d1
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bfcvt h1, s1
+; CHECK-BF16-NEXT: fmov w9, s1
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: fadd s0, s0, s1
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %c = sitofp i32 %a to bfloat ; signed i32 -> bf16 (scvtf to double, fcvtxn to f32, narrow to bf16)
+ %r = fadd bfloat %b, %c ; the converted value is narrowed to bf16 and then widened again (lsl #16) for the f32 fadd; the intermediate narrowing is kept in both runs
+ ret bfloat %r
+}
+
+define bfloat @test_fptrunc_float(float %a) #0 { ; float -> bfloat truncation; a single bfcvt is expected under +bf16
+; CHECK-CVT-LABEL: test_fptrunc_float:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: fcmp s0, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: orr w9, w9, #0x400000
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: csel w8, w9, w8, vs
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_fptrunc_float:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %narrow = fptrunc float %a to bfloat
+ ret bfloat %narrow
+}
+
+define bfloat @test_fptrunc_double(double %a) #0 { ; double -> bfloat truncation; checks expect fcvtxn to f32 first
+; CHECK-CVT-LABEL: test_fptrunc_double:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: fcvtxn s0, d0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_fptrunc_double:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: fcvtxn s0, d0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %narrow = fptrunc double %a to bfloat
+ ret bfloat %narrow
+}
+
+define float @test_fpext_float(bfloat %a) #0 { ; bfloat -> float extension; both run lines expect a 16-bit left shift
+; CHECK-LABEL: test_fpext_float:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ret
+ %wide = fpext bfloat %a to float
+ ret float %wide
+}
+
+define double @test_fpext_double(bfloat %a) #0 { ; bfloat -> double extension; checks expect widening to f32, then fcvt
+; CHECK-LABEL: test_fpext_double:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fcvt d0, s0
+; CHECK-NEXT: ret
+ %wide = fpext bfloat %a to double
+ ret double %wide
+}
+
+define i16 @test_bitcast_bfloattoi16(bfloat %a) #0 { ; reinterpret bfloat bits as i16; checks expect a plain fmov
+; CHECK-LABEL: test_bitcast_bfloattoi16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %bits = bitcast bfloat %a to i16
+ ret i16 %bits
+}
+
+define bfloat @test_bitcast_i16tobfloat(i16 %a) #0 { ; reinterpret i16 bits as bfloat; checks expect a plain fmov
+; CHECK-LABEL: test_bitcast_i16tobfloat:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+ %val = bitcast i16 %a to bfloat
+ ret bfloat %val
+}
+
+declare bfloat @llvm.sqrt.bf16(bfloat %a) #0 ; bfloat overloads of the math intrinsics; the .bf16 suffix matches the operand type
+declare bfloat @llvm.powi.bf16.i32(bfloat %a, i32 %b) #0
+declare bfloat @llvm.sin.bf16(bfloat %a) #0
+declare bfloat @llvm.cos.bf16(bfloat %a) #0
+declare bfloat @llvm.tan.bf16(bfloat %a) #0
+declare bfloat @llvm.asin.bf16(bfloat %a) #0
+declare bfloat @llvm.acos.bf16(bfloat %a) #0
+declare bfloat @llvm.atan.bf16(bfloat %a) #0
+declare bfloat @llvm.atan2.bf16(bfloat %a, bfloat %b) #0
+declare bfloat @llvm.sinh.bf16(bfloat %a) #0
+declare bfloat @llvm.cosh.bf16(bfloat %a) #0
+declare bfloat @llvm.tanh.bf16(bfloat %a) #0
+declare bfloat @llvm.pow.bf16(bfloat %a, bfloat %b) #0
+declare bfloat @llvm.exp.bf16(bfloat %a) #0
+declare bfloat @llvm.exp2.bf16(bfloat %a) #0
+declare bfloat @llvm.log.bf16(bfloat %a) #0
+declare bfloat @llvm.log10.bf16(bfloat %a) #0
+declare bfloat @llvm.log2.bf16(bfloat %a) #0
+declare bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c) #0
+declare bfloat @llvm.fabs.bf16(bfloat %a) #0
+declare bfloat @llvm.minnum.bf16(bfloat %a, bfloat %b) #0
+declare bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b) #0
+declare bfloat @llvm.copysign.bf16(bfloat %a, bfloat %b) #0
+declare bfloat @llvm.floor.bf16(bfloat %a) #0
+declare bfloat @llvm.ceil.bf16(bfloat %a) #0
+declare bfloat @llvm.trunc.bf16(bfloat %a) #0
+declare bfloat @llvm.rint.bf16(bfloat %a) #0
+declare bfloat @llvm.nearbyint.bf16(bfloat %a) #0
+declare bfloat @llvm.round.bf16(bfloat %a) #0
+declare bfloat @llvm.roundeven.bf16(bfloat %a) #0
+declare bfloat @llvm.fmuladd.bf16(bfloat %a, bfloat %b, bfloat %c) #0
+
+
+define bfloat @test_sqrt(bfloat %a) #0 { ; bfloat sqrt; checks expect an f32 fsqrt on the widened value
+; CHECK-CVT-LABEL: test_sqrt:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: fsqrt s0, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_sqrt:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fsqrt s0, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.sqrt.bf16(bfloat %a) ; .bf16 suffix names the bfloat overload
+ ret bfloat %r
+}
+
+define bfloat @test_powi(bfloat %a, i32 %b) #0 { ; bfloat powi; checks expect a __powisf2 libcall on the f32-widened value
+; CHECK-CVT-LABEL: test_powi:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl __powisf2
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_powi:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl __powisf2
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.powi.bf16.i32(bfloat %a, i32 %b) ; .bf16 suffix names the bfloat overload
+ ret bfloat %r
+}
+
+
+define bfloat @test_sin(bfloat %a) #0 { ; bfloat sin; checks expect a sinf libcall on the f32-widened value
+; CHECK-CVT-LABEL: test_sin:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl sinf
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_sin:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl sinf
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.sin.bf16(bfloat %a) ; .bf16 suffix names the bfloat overload
+ ret bfloat %r
+}
+
+define bfloat @test_cos(bfloat %a) #0 { ; bfloat cos; checks expect a cosf libcall on the f32-widened value
+; CHECK-CVT-LABEL: test_cos:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl cosf
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_cos:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl cosf
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.cos.bf16(bfloat %a) ; .bf16 suffix names the bfloat overload
+ ret bfloat %r
+}
+
+define bfloat @test_tan(bfloat %a) #0 { ; bfloat tan; checks expect a tanf libcall on the f32-widened value
+; CHECK-CVT-LABEL: test_tan:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl tanf
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_tan:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl tanf
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.tan.bf16(bfloat %a) ; .bf16 suffix names the bfloat overload
+ ret bfloat %r
+}
+
+define bfloat @test_acos(bfloat %a) #0 { ; bfloat acos; checks expect an acosf libcall on the f32-widened value
+; CHECK-CVT-LABEL: test_acos:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl acosf
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_acos:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl acosf
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.acos.bf16(bfloat %a) ; .bf16 suffix names the bfloat overload
+ ret bfloat %r
+}
+
+define bfloat @test_asin(bfloat %a) #0 { ; bfloat asin; checks expect an asinf libcall on the f32-widened value
+; CHECK-CVT-LABEL: test_asin:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl asinf
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_asin:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl asinf
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.asin.bf16(bfloat %a) ; .bf16 suffix names the bfloat overload
+ ret bfloat %r
+}
+
+define bfloat @test_atan(bfloat %a) #0 { ; bfloat atan; checks expect an atanf libcall on the f32-widened value
+; CHECK-CVT-LABEL: test_atan:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl atanf
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_atan:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl atanf
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.atan.bf16(bfloat %a) ; .bf16 suffix names the bfloat overload
+ ret bfloat %r
+}
+
+define bfloat @test_atan2(bfloat %a, bfloat %b) #0 { ; bfloat atan2; checks expect an atan2f libcall on both f32-widened operands
+; CHECK-CVT-LABEL: test_atan2:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: fmov w9, s1
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: fmov s1, w9
+; CHECK-CVT-NEXT: bl atan2f
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_atan2:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: fmov w9, s1
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: bl atan2f
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.atan2.bf16(bfloat %a, bfloat %b) ; .bf16 suffix names the bfloat overload
+ ret bfloat %r
+}
+
+define bfloat @test_cosh(bfloat %a) #0 { ; bfloat cosh; checks expect a coshf libcall on the f32-widened value
+; CHECK-CVT-LABEL: test_cosh:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl coshf
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_cosh:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl coshf
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.cosh.bf16(bfloat %a) ; .bf16 suffix names the bfloat overload
+ ret bfloat %r
+}
+
+define bfloat @test_sinh(bfloat %a) #0 { ; bfloat sinh; checks expect a sinhf libcall on the f32-widened value
+; CHECK-CVT-LABEL: test_sinh:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl sinhf
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_sinh:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl sinhf
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.sinh.bf16(bfloat %a) ; .bf16 suffix names the bfloat overload
+ ret bfloat %r
+}
+
+define bfloat @test_tanh(bfloat %a) #0 { ; bfloat tanh; checks expect a tanhf libcall on the f32-widened value
+; CHECK-CVT-LABEL: test_tanh:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl tanhf
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_tanh:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl tanhf
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.tanh.bf16(bfloat %a) ; .bf16 suffix names the bfloat overload
+ ret bfloat %r
+}
+
+define bfloat @test_pow(bfloat %a, bfloat %b) #0 { ; bfloat pow; checks expect a powf libcall on both f32-widened operands
+; CHECK-CVT-LABEL: test_pow:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: fmov w9, s1
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: fmov s1, w9
+; CHECK-CVT-NEXT: bl powf
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_pow:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: fmov w9, s1
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: bl powf
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.pow.bf16(bfloat %a, bfloat %b) ; .bf16 suffix names the bfloat overload
+ ret bfloat %r
+}
+
+define bfloat @test_exp(bfloat %a) #0 { ; bfloat exp; checks expect an expf libcall on the f32-widened value
+; CHECK-CVT-LABEL: test_exp:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl expf
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_exp:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl expf
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.exp.bf16(bfloat %a) ; .bf16 suffix names the bfloat overload
+ ret bfloat %r
+}
+
+define bfloat @test_exp2(bfloat %a) #0 { ; bfloat exp2; checks expect an exp2f libcall on the f32-widened value
+; CHECK-CVT-LABEL: test_exp2:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl exp2f
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_exp2:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl exp2f
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.exp2.bf16(bfloat %a) ; .bf16 suffix names the bfloat overload
+ ret bfloat %r
+}
+
+define bfloat @test_log(bfloat %a) #0 { ; bfloat log; checks expect a logf libcall on the f32-widened value
+; CHECK-CVT-LABEL: test_log:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl logf
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_log:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl logf
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.log.bf16(bfloat %a) ; .bf16 suffix names the bfloat overload
+ ret bfloat %r
+}
+
+define bfloat @test_log10(bfloat %a) #0 { ; bfloat log10; checks expect a log10f libcall on the f32-widened value
+; CHECK-CVT-LABEL: test_log10:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl log10f
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_log10:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl log10f
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.log10.bf16(bfloat %a) ; .bf16 suffix names the bfloat overload
+ ret bfloat %r
+}
+
+define bfloat @test_log2(bfloat %a) #0 { ; bfloat log2; checks expect a log2f libcall on the f32-widened value
+; CHECK-CVT-LABEL: test_log2:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl log2f
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_log2:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl log2f
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.log2.bf16(bfloat %a) ; .bf16 suffix names the bfloat overload
+ ret bfloat %r
+}
+
+define bfloat @test_fma(bfloat %a, bfloat %b, bfloat %c) #0 { ; bfloat fma; checks expect one f32 fmadd on the three widened operands
+; CHECK-CVT-LABEL: test_fma:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h2 killed $h2 def $s2
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s2
+; CHECK-CVT-NEXT: fmov w9, s1
+; CHECK-CVT-NEXT: fmov w10, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: lsl w10, w10, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: fmov s1, w9
+; CHECK-CVT-NEXT: fmov s2, w10
+; CHECK-CVT-NEXT: mov w10, #32767 // =0x7fff
+; CHECK-CVT-NEXT: fmadd s0, s2, s1, s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: ubfx w9, w8, #16, #1
+; CHECK-CVT-NEXT: add w8, w8, w10
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_fma:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $s2
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s2
+; CHECK-BF16-NEXT: fmov w9, s1
+; CHECK-BF16-NEXT: fmov w10, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: lsl w10, w10, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: fmov s2, w10
+; CHECK-BF16-NEXT: fmadd s0, s2, s1, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c) ; .bf16 suffix names the bfloat overload
+ ret bfloat %r
+}
+
+define bfloat @test_fabs(bfloat %a) #0 { ; bfloat fabs; checks expect an integer and with 0x7fff under both run lines
+; CHECK-LABEL: test_fabs:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and w8, w8, #0x7fff
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+ %r = call bfloat @llvm.fabs.bf16(bfloat %a) ; .bf16 suffix names the bfloat overload
+ ret bfloat %r
+}
+
+define bfloat @test_minnum(bfloat %a, bfloat %b) #0 { ; bfloat minnum; checks expect an f32 fminnm on the widened operands
+; CHECK-CVT-LABEL: test_minnum:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s1
+; CHECK-CVT-NEXT: fmov w10, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: lsl w10, w10, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: fmov s1, w10
+; CHECK-CVT-NEXT: fminnm s0, s1, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_minnum:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s1
+; CHECK-BF16-NEXT: fmov w9, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: fminnm s0, s1, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.minnum.bf16(bfloat %a, bfloat %b) ; .bf16 suffix names the bfloat overload
+ ret bfloat %r
+}
+
+define bfloat @test_maxnum(bfloat %a, bfloat %b) #0 { ; bfloat maxnum; checks expect an f32 fmaxnm on the widened operands
+; CHECK-CVT-LABEL: test_maxnum:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s1
+; CHECK-CVT-NEXT: fmov w10, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: lsl w10, w10, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: fmov s1, w10
+; CHECK-CVT-NEXT: fmaxnm s0, s1, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_maxnum:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s1
+; CHECK-BF16-NEXT: fmov w9, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: fmaxnm s0, s1, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b) ; .bf16 suffix names the bfloat overload
+ ret bfloat %r
+}
+
+define bfloat @test_copysign(bfloat %a, bfloat %b) #0 { ; bf16 copysign: both operands widened to f32, sign merged with a vector bit-select (bit)
+; CHECK-CVT-LABEL: test_copysign:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s1
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mvni v2.4s, #128, lsl #24
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: fmov s1, w9
+; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v2.16b
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_copysign:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s1
+; CHECK-BF16-NEXT: fmov w9, s0
+; CHECK-BF16-NEXT: mvni v2.4s, #128, lsl #24
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: bit v0.16b, v1.16b, v2.16b
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.copysign.bf16(bfloat %a, bfloat %b)
+ ret bfloat %r
+}
+
+define bfloat @test_copysign_f32(bfloat %a, float %b) #0 { ; copysign with the sign taken from a float truncated to bf16
+; CHECK-CVT-LABEL: test_copysign_f32:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: mvni v2.4s, #128, lsl #24
+; CHECK-CVT-NEXT: // kill: def $s1 killed $s1 def $q1
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_copysign_f32:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: mvni v2.4s, #128, lsl #24
+; CHECK-BF16-NEXT: // kill: def $s1 killed $s1 def $q1
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %tb = fptrunc float %b to bfloat ; the expected code feeds s1 straight into bif, with no rounding sequence for %b
+ %r = call bfloat @llvm.copysign.bf16(bfloat %a, bfloat %tb)
+ ret bfloat %r
+}
+
+define bfloat @test_copysign_f64(bfloat %a, double %b) #0 { ; copysign with the sign taken from a double truncated to bf16
+; CHECK-CVT-LABEL: test_copysign_f64:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: fcvt s1, d1
+; CHECK-CVT-NEXT: mvni v2.4s, #128, lsl #24
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_copysign_f64:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: fcvt s1, d1
+; CHECK-BF16-NEXT: mvni v2.4s, #128, lsl #24
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %tb = fptrunc double %b to bfloat ; the expected code narrows %b with a single fcvt to f32 before the bif
+ %r = call bfloat @llvm.copysign.bf16(bfloat %a, bfloat %tb)
+ ret bfloat %r
+}
+
+; The FP promotion is expected to use a truncating FP_ROUND, so the (fpext (fp_round <result>)) can be folded away here.
+
+define float @test_copysign_extended(bfloat %a, bfloat %b) #0 { ; bf16 copysign whose result is immediately extended to float
+; CHECK-CVT-LABEL: test_copysign_extended:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s1
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mvni v2.4s, #128, lsl #24
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: fmov s1, w9
+; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v2.16b
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_copysign_extended:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s1
+; CHECK-BF16-NEXT: fmov w9, s0
+; CHECK-BF16-NEXT: mvni v2.4s, #128, lsl #24
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: bit v0.16b, v1.16b, v2.16b
+; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.copysign.bf16(bfloat %a, bfloat %b)
+ %xr = fpext bfloat %r to float ; with +bf16 no bfcvt remains; without it an lsr/lsl #16 pair is still emitted
+ ret float %xr
+}
+
+define bfloat @test_floor(bfloat %a) #0 { ; bf16 floor: widened to f32, frintm, rounded back to bf16
+; CHECK-CVT-LABEL: test_floor:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: frintm s0, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_floor:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: frintm s0, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.floor.bf16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_ceil(bfloat %a) #0 { ; bf16 ceil: widened to f32, frintp, rounded back to bf16
+; CHECK-CVT-LABEL: test_ceil:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: frintp s0, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_ceil:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: frintp s0, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.ceil.bf16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_trunc(bfloat %a) #0 { ; bf16 trunc: widened to f32, frintz, rounded back to bf16
+; CHECK-CVT-LABEL: test_trunc:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: frintz s0, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_trunc:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: frintz s0, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.trunc.bf16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_rint(bfloat %a) #0 { ; bf16 rint: widened to f32, frintx, rounded back to bf16
+; CHECK-CVT-LABEL: test_rint:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: frintx s0, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_rint:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: frintx s0, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.rint.bf16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_nearbyint(bfloat %a) #0 { ; bf16 nearbyint: widened to f32, frinti, rounded back to bf16
+; CHECK-CVT-LABEL: test_nearbyint:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: frinti s0, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_nearbyint:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: frinti s0, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.nearbyint.bf16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_round(bfloat %a) #0 { ; bf16 round: widened to f32, frinta, rounded back to bf16
+; CHECK-CVT-LABEL: test_round:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: frinta s0, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_round:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: frinta s0, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.round.bf16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_roundeven(bfloat %a) #0 { ; bf16 roundeven: widened to f32, frintn, rounded back to bf16
+; CHECK-CVT-LABEL: test_roundeven:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: frintn s0, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_roundeven:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: frintn s0, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.roundeven.bf16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_fmuladd(bfloat %a, bfloat %b, bfloat %c) #0 { ; bf16 fmuladd: expected code is unfused, an f32 fmul and an f32 fadd each rounded to bf16
+; CHECK-CVT-LABEL: test_fmuladd:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s1
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w10, #32767 // =0x7fff
+; CHECK-CVT-NEXT: // kill: def $h2 killed $h2 def $s2
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: fmov s1, w9
+; CHECK-CVT-NEXT: fmul s0, s1, s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: ubfx w9, w8, #16, #1
+; CHECK-CVT-NEXT: add w8, w8, w10
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: fmov w9, s2
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: fmov s1, w9
+; CHECK-CVT-NEXT: fadd s0, s0, s1
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: ubfx w9, w8, #16, #1
+; CHECK-CVT-NEXT: add w8, w8, w10
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_fmuladd:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s1
+; CHECK-BF16-NEXT: fmov w9, s0
+; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $s2
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: fmov w9, s2
+; CHECK-BF16-NEXT: fmul s0, s1, s0
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fadd s0, s0, s1
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.fmuladd.bf16(bfloat %a, bfloat %b, bfloat %c)
+ ret bfloat %r
+}
+
+attributes #0 = { nounwind } ; attribute group referenced as #0 by the test functions above
diff --git a/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll
new file mode 100644
index 00000000000000..9b6e19eba3f4e6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll
@@ -0,0 +1,711 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64 -mattr=-bf16 | FileCheck %s --check-prefixes=CHECK,CHECK-CVT
+; RUN: llc < %s -mtriple=aarch64 -mattr=+bf16 | FileCheck %s --check-prefixes=CHECK,CHECK-BF16
+
+define <4 x bfloat> @add_h(<4 x bfloat> %a, <4 x bfloat> %b) { ; <4 x bfloat> fadd: lanes widened to f32 (shll #16), added, then narrowed back to bf16
+; CHECK-CVT-LABEL: add_h:
+; CHECK-CVT: // %bb.0: // %entry
+; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: add_h:
+; CHECK-BF16: // %bb.0: // %entry
+; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+entry:
+
+ %0 = fadd <4 x bfloat> %a, %b
+ ret <4 x bfloat> %0
+}
+
+
+define <4 x bfloat> @build_h4(<4 x bfloat> %a) { ; returns a constant splat (%a is unused); expected code is mov of the bit pattern plus dup
+; CHECK-LABEL: build_h4:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #15565 // =0x3ccd
+; CHECK-NEXT: dup v0.4h, w8
+; CHECK-NEXT: ret
+entry:
+ ret <4 x bfloat> <bfloat 0xR3CCD, bfloat 0xR3CCD, bfloat 0xR3CCD, bfloat 0xR3CCD>
+}
+
+
+define <4 x bfloat> @sub_h(<4 x bfloat> %a, <4 x bfloat> %b) { ; <4 x bfloat> fsub: lanes widened to f32 (shll #16), subtracted, then narrowed back to bf16
+; CHECK-CVT-LABEL: sub_h:
+; CHECK-CVT: // %bb.0: // %entry
+; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT: fsub v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: sub_h:
+; CHECK-BF16: // %bb.0: // %entry
+; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT: fsub v0.4s, v0.4s, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+entry:
+
+ %0 = fsub <4 x bfloat> %a, %b
+ ret <4 x bfloat> %0
+}
+
+
+define <4 x bfloat> @mul_h(<4 x bfloat> %a, <4 x bfloat> %b) { ; <4 x bfloat> fmul: lanes widened to f32 (shll #16), multiplied, then narrowed back to bf16
+; CHECK-CVT-LABEL: mul_h:
+; CHECK-CVT: // %bb.0: // %entry
+; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT: fmul v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: mul_h:
+; CHECK-BF16: // %bb.0: // %entry
+; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT: fmul v0.4s, v0.4s, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+entry:
+
+ %0 = fmul <4 x bfloat> %a, %b
+ ret <4 x bfloat> %0
+}
+
+
+define <4 x bfloat> @div_h(<4 x bfloat> %a, <4 x bfloat> %b) { ; <4 x bfloat> fdiv: lanes widened to f32 (shll #16), divided, then narrowed back to bf16
+; CHECK-CVT-LABEL: div_h:
+; CHECK-CVT: // %bb.0: // %entry
+; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT: fdiv v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: div_h:
+; CHECK-BF16: // %bb.0: // %entry
+; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT: fdiv v0.4s, v0.4s, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+entry:
+
+ %0 = fdiv <4 x bfloat> %a, %b
+ ret <4 x bfloat> %0
+}
+
+
+define <4 x bfloat> @load_h(ptr %a) { ; <4 x bfloat> load with align 4: expected code is a single 64-bit ldr
+; CHECK-LABEL: load_h:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = load <4 x bfloat>, ptr %a, align 4
+ ret <4 x bfloat> %0
+}
+
+
+define void @store_h(ptr %a, <4 x bfloat> %b) { ; <4 x bfloat> store with align 4: expected code is a single 64-bit str
+; CHECK-LABEL: store_h:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+entry:
+ store <4 x bfloat> %b, ptr %a, align 4
+ ret void
+}
+
+define <4 x bfloat> @s_to_h(<4 x float> %a) { ; fptrunc <4 x float> to <4 x bfloat>: one bfcvtn with +bf16, an integer rounding sequence otherwise
+; CHECK-CVT-LABEL: s_to_h:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: ushr v3.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v1.16b, v3.16b, v1.16b
+; CHECK-CVT-NEXT: add v2.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT: fcmeq v3.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v3.16b
+; CHECK-CVT-NEXT: shrn v0.4h, v0.4s, #16
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: s_to_h:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+ %1 = fptrunc <4 x float> %a to <4 x bfloat> ; without +bf16 the self-compare (fcmeq) picks the rounded value, else the input with bit 22 set
+ ret <4 x bfloat> %1
+}
+
+define <4 x bfloat> @d_to_h(<4 x double> %a) { ; fptrunc <4 x double> to <4 x bfloat>: narrowed to f32 with fcvtxn/fcvtxn2 first, then rounded to bf16
+; CHECK-CVT-LABEL: d_to_h:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: fcvtxn2 v0.4s, v1.2d
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: ushr v3.4s, v0.4s, #16
+; CHECK-CVT-NEXT: add v2.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT: and v1.16b, v3.16b, v1.16b
+; CHECK-CVT-NEXT: fcmeq v3.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v3.16b
+; CHECK-CVT-NEXT: shrn v0.4h, v0.4s, #16
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: d_to_h:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-BF16-NEXT: fcvtxn2 v0.4s, v1.2d
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+ %1 = fptrunc <4 x double> %a to <4 x bfloat>
+ ret <4 x bfloat> %1
+}
+
+define <4 x float> @h_to_s(<4 x bfloat> %a) { ; fpext <4 x bfloat> to <4 x float>: a single shll #16 in both run configurations
+; CHECK-LABEL: h_to_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: ret
+ %1 = fpext <4 x bfloat> %a to <4 x float>
+ ret <4 x float> %1
+}
+
+define <4 x double> @h_to_d(<4 x bfloat> %a) { ; fpext <4 x bfloat> to <4 x double>: shll #16 to f32, then fcvtl/fcvtl2 to the two f64 halves
+; CHECK-LABEL: h_to_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcvtl2 v1.2d, v0.4s
+; CHECK-NEXT: fcvtl v0.2d, v0.2s
+; CHECK-NEXT: ret
+ %1 = fpext <4 x bfloat> %a to <4 x double>
+ ret <4 x double> %1
+}
+
+define <4 x bfloat> @bitcast_i_to_h(float, <4 x i16> %a) { ; the unnamed float argument places %a in d1, so the bitcast shows up as a register move
+; CHECK-LABEL: bitcast_i_to_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ret
+ %2 = bitcast <4 x i16> %a to <4 x bfloat>
+ ret <4 x bfloat> %2
+}
+
+define <4 x i16> @bitcast_h_to_i(float, <4 x bfloat> %a) { ; the unnamed float argument places %a in d1, so the bitcast shows up as a register move
+; CHECK-LABEL: bitcast_h_to_i:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ret
+ %2 = bitcast <4 x bfloat> %a to <4 x i16>
+ ret <4 x i16> %2
+}
+
+define <4 x bfloat> @sitofp_i8(<4 x i8> %a) #0 { ; sitofp from <4 x i8>: sign-extended inside 16-bit lanes (shl/sshr #8), widened, scvtf in f32, rounded to bf16
+; CHECK-CVT-LABEL: sitofp_i8:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: shl v0.4h, v0.4h, #8
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: sshr v0.4h, v0.4h, #8
+; CHECK-CVT-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-CVT-NEXT: scvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: sitofp_i8:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: shl v0.4h, v0.4h, #8
+; CHECK-BF16-NEXT: sshr v0.4h, v0.4h, #8
+; CHECK-BF16-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+ %1 = sitofp <4 x i8> %a to <4 x bfloat>
+ ret <4 x bfloat> %1
+}
+
+define <4 x bfloat> @sitofp_i16(<4 x i16> %a) #0 { ; sitofp from <4 x i16>: sshll to 32-bit lanes, scvtf in f32, rounded to bf16
+; CHECK-CVT-LABEL: sitofp_i16:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: scvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: sitofp_i16:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+ %1 = sitofp <4 x i16> %a to <4 x bfloat>
+ ret <4 x bfloat> %1
+}
+
+
+define <4 x bfloat> @sitofp_i32(<4 x i32> %a) #0 { ; sitofp from <4 x i32>: scvtf in f32, then rounded to bf16
+; CHECK-CVT-LABEL: sitofp_i32:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: scvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: sitofp_i32:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+ %1 = sitofp <4 x i32> %a to <4 x bfloat>
+ ret <4 x bfloat> %1
+}
+
+
+define <4 x bfloat> @sitofp_i64(<4 x i64> %a) #0 { ; sitofp from <4 x i64>: scvtf in f64, fcvtn/fcvtn2 to f32, then rounded to bf16
+; CHECK-CVT-LABEL: sitofp_i64:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: scvtf v0.2d, v0.2d
+; CHECK-CVT-NEXT: scvtf v1.2d, v1.2d
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-CVT-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: ushr v3.4s, v0.4s, #16
+; CHECK-CVT-NEXT: add v2.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT: and v1.16b, v3.16b, v1.16b
+; CHECK-CVT-NEXT: fcmeq v3.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v3.16b
+; CHECK-CVT-NEXT: shrn v0.4h, v0.4s, #16
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: sitofp_i64:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: scvtf v0.2d, v0.2d
+; CHECK-BF16-NEXT: scvtf v1.2d, v1.2d
+; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: ret
+ %1 = sitofp <4 x i64> %a to <4 x bfloat> ; NOTE(review): narrowing uses fcvtn here but fcvtxn in d_to_h; confirm the extra rounding step is intended
+ ret <4 x bfloat> %1
+}
+
+define <4 x bfloat> @uitofp_i8(<4 x i8> %a) #0 { ; uitofp from <4 x i8>: high byte of each 16-bit lane cleared (bic), widened, ucvtf in f32, rounded to bf16
+; CHECK-CVT-LABEL: uitofp_i8:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: bic v0.4h, #255, lsl #8
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-CVT-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: uitofp_i8:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: bic v0.4h, #255, lsl #8
+; CHECK-BF16-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+ %1 = uitofp <4 x i8> %a to <4 x bfloat>
+ ret <4 x bfloat> %1
+}
+
+
+define <4 x bfloat> @uitofp_i16(<4 x i16> %a) #0 { ; uitofp from <4 x i16>: ushll to 32-bit lanes, ucvtf in f32, rounded to bf16
+; CHECK-CVT-LABEL: uitofp_i16:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: uitofp_i16:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+ %1 = uitofp <4 x i16> %a to <4 x bfloat>
+ ret <4 x bfloat> %1
+}
+
+
+define <4 x bfloat> @uitofp_i32(<4 x i32> %a) #0 { ; uitofp from <4 x i32>: ucvtf in f32, then rounded to bf16
+; CHECK-CVT-LABEL: uitofp_i32:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: uitofp_i32:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+ %1 = uitofp <4 x i32> %a to <4 x bfloat>
+ ret <4 x bfloat> %1
+}
+
+
+define <4 x bfloat> @uitofp_i64(<4 x i64> %a) #0 { ; uitofp from <4 x i64>: ucvtf in f64, fcvtn/fcvtn2 to f32, then rounded to bf16
+; CHECK-CVT-LABEL: uitofp_i64:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: ucvtf v0.2d, v0.2d
+; CHECK-CVT-NEXT: ucvtf v1.2d, v1.2d
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-CVT-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: ushr v3.4s, v0.4s, #16
+; CHECK-CVT-NEXT: add v2.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT: and v1.16b, v3.16b, v1.16b
+; CHECK-CVT-NEXT: fcmeq v3.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v3.16b
+; CHECK-CVT-NEXT: shrn v0.4h, v0.4s, #16
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: uitofp_i64:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: ucvtf v0.2d, v0.2d
+; CHECK-BF16-NEXT: ucvtf v1.2d, v1.2d
+; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+ %1 = uitofp <4 x i64> %a to <4 x bfloat> ; NOTE(review): narrowing uses fcvtn here but fcvtxn in d_to_h; confirm the extra rounding step is intended
+ ret <4 x bfloat> %1
+}
+
+define void @test_insert_at_zero(bfloat %a, ptr %b) #0 { ; insert a scalar into lane 0 of an otherwise unspecified vector and store it: no lane insert is expected, only str d0
+; CHECK-LABEL: test_insert_at_zero:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+ %1 = insertelement <4 x bfloat> poison, bfloat %a, i64 0 ; poison rather than the deprecated undef as the placeholder base vector
+ store <4 x bfloat> %1, ptr %b, align 4
+ ret void
+}
+
+define <4 x i8> @fptosi_i8(<4 x bfloat> %a) #0 { ; fptosi to <4 x i8>: widened to f32 (shll #16), fcvtzs, then narrowed with xtn
+; CHECK-LABEL: fptosi_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+ %1 = fptosi<4 x bfloat> %a to <4 x i8>
+ ret <4 x i8> %1
+}
+
+define <4 x i16> @fptosi_i16(<4 x bfloat> %a) #0 { ; fptosi to <4 x i16>: widened to f32 (shll #16), fcvtzs, then narrowed with xtn
+; CHECK-LABEL: fptosi_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+ %1 = fptosi<4 x bfloat> %a to <4 x i16>
+ ret <4 x i16> %1
+}
+
+define <4 x i8> @fptoui_i8(<4 x bfloat> %a) #0 { ; fptoui to <4 x i8>: widened to f32 (shll #16), converted with the signed fcvtzs, then narrowed with xtn
+; CHECK-LABEL: fptoui_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+; NOTE: fcvtzs is selected here because the narrowing xtn shaves off the sign bit.
+ %1 = fptoui<4 x bfloat> %a to <4 x i8>
+ ret <4 x i8> %1
+}
+
+define <4 x i16> @fptoui_i16(<4 x bfloat> %a) #0 { ; fptoui to <4 x i16>: widened to f32 (shll #16), fcvtzu, then narrowed with xtn
+; CHECK-LABEL: fptoui_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+ %1 = fptoui<4 x bfloat> %a to <4 x i16>
+ ret <4 x i16> %1
+}
+
+define <4 x i1> @test_fcmp_une(<4 x bfloat> %a, <4 x bfloat> %b) #0 { ; fcmp une in f32 lanes: fcmeq, inverted with mvn, then narrowed
+; CHECK-LABEL: test_fcmp_une:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+
+ %1 = fcmp une <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+define <4 x i1> @test_fcmp_ueq(<4 x bfloat> %a, <4 x bfloat> %b) #0 { ; fcmp ueq in f32 lanes: (a > b) | (b > a), narrowed, then inverted
+; CHECK-LABEL: test_fcmp_ueq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcmgt v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: ret
+
+ %1 = fcmp ueq <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+define <4 x i1> @test_fcmp_ugt(<4 x bfloat> %a, <4 x bfloat> %b) #0 { ; fcmp ugt in f32 lanes: computed as the inverse of (b >= a)
+; CHECK-LABEL: test_fcmp_ugt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: fcmge v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: ret
+
+ %1 = fcmp ugt <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+define <4 x i1> @test_fcmp_uge(<4 x bfloat> %a, <4 x bfloat> %b) #0 { ; fcmp uge in f32 lanes: computed as the inverse of (b > a)
+; CHECK-LABEL: test_fcmp_uge:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: ret
+
+ %1 = fcmp uge <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+define <4 x i1> @test_fcmp_ult(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ult:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcmge v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: ret
+; ult (unordered or less than): expected mask is NOT(a >= b) on the f32-widened lanes, inverted after the xtn narrowing.
+ %1 = fcmp ult <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+define <4 x i1> @test_fcmp_ule(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ule:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcmgt v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: ret
+; ule (unordered or less or equal): expected mask is NOT(a > b) on the f32-widened lanes, inverted after the xtn narrowing.
+ %1 = fcmp ule <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+define <4 x i1> @test_fcmp_uno(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_uno:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcmge v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: ret
+; uno (either operand is NaN): expected mask is NOT((a >= b) OR (b > a)) on the f32-widened lanes, inverted after the xtn narrowing.
+ %1 = fcmp uno <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+define <4 x i1> @test_fcmp_one(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcmgt v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+; one (ordered and not equal): expected mask is (a > b) OR (b > a) on the f32-widened lanes, narrowed with xtn and not inverted.
+ %1 = fcmp one <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+define <4 x i1> @test_fcmp_oeq(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_oeq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+; oeq (ordered and equal): expected as a single fcmeq on the f32-widened lanes followed by the xtn narrowing.
+ %1 = fcmp oeq <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+define <4 x i1> @test_fcmp_ogt(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ogt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcmgt v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+; ogt (ordered and greater than): expected as fcmgt a, b on the f32-widened lanes followed by the xtn narrowing.
+ %1 = fcmp ogt <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+define <4 x i1> @test_fcmp_oge(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_oge:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcmge v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+; oge (ordered and greater or equal): expected as fcmge a, b on the f32-widened lanes followed by the xtn narrowing.
+ %1 = fcmp oge <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+define <4 x i1> @test_fcmp_olt(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_olt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+; olt (ordered and less than): expected as fcmgt with the operands swapped (b > a) on the f32-widened lanes, then xtn.
+ %1 = fcmp olt <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+define <4 x i1> @test_fcmp_ole(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ole:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: fcmge v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+; ole (ordered and less or equal): expected as fcmge with the operands swapped (b >= a) on the f32-widened lanes, then xtn.
+ %1 = fcmp ole <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+define <4 x i1> @test_fcmp_ord(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ord:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcmge v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+; ord (neither operand is NaN): expected mask is (a >= b) OR (b > a) on the f32-widened lanes, narrowed with xtn and not inverted.
+ %1 = fcmp ord <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+attributes #0 = { nounwind } ; attribute group referenced as #0 by the function definitions in this file
diff --git a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
new file mode 100644
index 00000000000000..c03e2e5321321a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
@@ -0,0 +1,2192 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64 -mattr=-bf16 | FileCheck %s --check-prefixes=CHECK,CHECK-CVT
+; RUN: llc < %s -mtriple=aarch64 -mattr=+bf16 | FileCheck %s --check-prefixes=CHECK,CHECK-BF16
+
+define <8 x bfloat> @add_h(<8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-CVT-LABEL: add_h:
+; CHECK-CVT: // %bb.0: // %entry
+; CHECK-CVT-NEXT: shll2 v3.4s, v1.8h, #16
+; CHECK-CVT-NEXT: shll2 v4.4s, v0.8h, #16
+; CHECK-CVT-NEXT: movi v2.4s, #1
+; CHECK-CVT-NEXT: ushr v5.4s, v0.4s, #16
+; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT: fadd v3.4s, v4.4s, v3.4s
+; CHECK-CVT-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT: and v2.16b, v5.16b, v2.16b
+; CHECK-CVT-NEXT: movi v1.4s, #127, msl #8
+; CHECK-CVT-NEXT: fcmeq v5.4s, v3.4s, v3.4s
+; CHECK-CVT-NEXT: add v4.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT: orr v3.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v2.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT: fcmeq v6.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v4.4s, v4.4s, v1.4s
+; CHECK-CVT-NEXT: add v1.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT: mov v2.16b, v5.16b
+; CHECK-CVT-NEXT: bsl v2.16b, v4.16b, v3.16b
+; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v6.16b
+; CHECK-CVT-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: add_h:
+; CHECK-BF16: // %bb.0: // %entry
+; CHECK-BF16-NEXT: shll v2.4s, v1.4h, #16
+; CHECK-BF16-NEXT: shll v3.4s, v0.4h, #16
+; CHECK-BF16-NEXT: shll2 v1.4s, v1.8h, #16
+; CHECK-BF16-NEXT: shll2 v0.4s, v0.8h, #16
+; CHECK-BF16-NEXT: fadd v2.4s, v3.4s, v2.4s
+; CHECK-BF16-NEXT: fadd v1.4s, v0.4s, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v2.4s
+; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s
+; CHECK-BF16-NEXT: ret
+entry:
+ %0 = fadd <8 x bfloat> %a, %b ; both configs widen each half to f32 (shll/shll2 #16) and fadd there; no +bf16 rounds back with integer adds (lsb + 0x7fff), an fcmeq/orr NaN select and uzp2, +bf16 narrows with bfcvtn/bfcvtn2
+ ret <8 x bfloat> %0 ; NOTE(review): in the no-+bf16 expectation the ushr feeding the rounding lsb reads v0 while it still holds %a, not an fadd result - confirm this is intended
+}
+
+
+define <8 x bfloat> @sub_h(<8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-CVT-LABEL: sub_h:
+; CHECK-CVT: // %bb.0: // %entry
+; CHECK-CVT-NEXT: shll2 v3.4s, v1.8h, #16
+; CHECK-CVT-NEXT: shll2 v4.4s, v0.8h, #16
+; CHECK-CVT-NEXT: movi v2.4s, #1
+; CHECK-CVT-NEXT: ushr v5.4s, v0.4s, #16
+; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT: fsub v3.4s, v4.4s, v3.4s
+; CHECK-CVT-NEXT: fsub v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT: and v2.16b, v5.16b, v2.16b
+; CHECK-CVT-NEXT: movi v1.4s, #127, msl #8
+; CHECK-CVT-NEXT: fcmeq v5.4s, v3.4s, v3.4s
+; CHECK-CVT-NEXT: add v4.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT: orr v3.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v2.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT: fcmeq v6.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v4.4s, v4.4s, v1.4s
+; CHECK-CVT-NEXT: add v1.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT: mov v2.16b, v5.16b
+; CHECK-CVT-NEXT: bsl v2.16b, v4.16b, v3.16b
+; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v6.16b
+; CHECK-CVT-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: sub_h:
+; CHECK-BF16: // %bb.0: // %entry
+; CHECK-BF16-NEXT: shll v2.4s, v1.4h, #16
+; CHECK-BF16-NEXT: shll v3.4s, v0.4h, #16
+; CHECK-BF16-NEXT: shll2 v1.4s, v1.8h, #16
+; CHECK-BF16-NEXT: shll2 v0.4s, v0.8h, #16
+; CHECK-BF16-NEXT: fsub v2.4s, v3.4s, v2.4s
+; CHECK-BF16-NEXT: fsub v1.4s, v0.4s, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v2.4s
+; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s
+; CHECK-BF16-NEXT: ret
+entry:
+ %0 = fsub <8 x bfloat> %a, %b ; both configs widen each half to f32 (shll/shll2 #16) and fsub there; no +bf16 rounds back with integer adds (lsb + 0x7fff), an fcmeq/orr NaN select and uzp2, +bf16 narrows with bfcvtn/bfcvtn2
+ ret <8 x bfloat> %0 ; NOTE(review): in the no-+bf16 expectation the ushr feeding the rounding lsb reads v0 while it still holds %a, not an fsub result - confirm this is intended
+}
+
+
+define <8 x bfloat> @mul_h(<8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-CVT-LABEL: mul_h:
+; CHECK-CVT: // %bb.0: // %entry
+; CHECK-CVT-NEXT: shll2 v3.4s, v1.8h, #16
+; CHECK-CVT-NEXT: shll2 v4.4s, v0.8h, #16
+; CHECK-CVT-NEXT: movi v2.4s, #1
+; CHECK-CVT-NEXT: ushr v5.4s, v0.4s, #16
+; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT: fmul v3.4s, v4.4s, v3.4s
+; CHECK-CVT-NEXT: fmul v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT: and v2.16b, v5.16b, v2.16b
+; CHECK-CVT-NEXT: movi v1.4s, #127, msl #8
+; CHECK-CVT-NEXT: fcmeq v5.4s, v3.4s, v3.4s
+; CHECK-CVT-NEXT: add v4.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT: orr v3.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v2.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT: fcmeq v6.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v4.4s, v4.4s, v1.4s
+; CHECK-CVT-NEXT: add v1.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT: mov v2.16b, v5.16b
+; CHECK-CVT-NEXT: bsl v2.16b, v4.16b, v3.16b
+; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v6.16b
+; CHECK-CVT-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: mul_h:
+; CHECK-BF16: // %bb.0: // %entry
+; CHECK-BF16-NEXT: shll v2.4s, v1.4h, #16
+; CHECK-BF16-NEXT: shll v3.4s, v0.4h, #16
+; CHECK-BF16-NEXT: shll2 v1.4s, v1.8h, #16
+; CHECK-BF16-NEXT: shll2 v0.4s, v0.8h, #16
+; CHECK-BF16-NEXT: fmul v2.4s, v3.4s, v2.4s
+; CHECK-BF16-NEXT: fmul v1.4s, v0.4s, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v2.4s
+; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s
+; CHECK-BF16-NEXT: ret
+entry:
+ %0 = fmul <8 x bfloat> %a, %b ; both configs widen each half to f32 (shll/shll2 #16) and fmul there; no +bf16 rounds back with integer adds (lsb + 0x7fff), an fcmeq/orr NaN select and uzp2, +bf16 narrows with bfcvtn/bfcvtn2
+ ret <8 x bfloat> %0 ; NOTE(review): in the no-+bf16 expectation the ushr feeding the rounding lsb reads v0 while it still holds %a, not an fmul result - confirm this is intended
+}
+
+
+define <8 x bfloat> @div_h(<8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-CVT-LABEL: div_h:
+; CHECK-CVT: // %bb.0: // %entry
+; CHECK-CVT-NEXT: shll2 v2.4s, v1.8h, #16
+; CHECK-CVT-NEXT: shll2 v3.4s, v0.8h, #16
+; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT: movi v4.4s, #127, msl #8
+; CHECK-CVT-NEXT: fdiv v2.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT: shll v3.4s, v0.4h, #16
+; CHECK-CVT-NEXT: ushr v0.4s, v0.4s, #16
+; CHECK-CVT-NEXT: fdiv v1.4s, v3.4s, v1.4s
+; CHECK-CVT-NEXT: movi v3.4s, #1
+; CHECK-CVT-NEXT: and v0.16b, v0.16b, v3.16b
+; CHECK-CVT-NEXT: add v0.4s, v0.4s, v4.4s
+; CHECK-CVT-NEXT: fcmeq v4.4s, v2.4s, v2.4s
+; CHECK-CVT-NEXT: add v3.4s, v2.4s, v0.4s
+; CHECK-CVT-NEXT: orr v2.4s, #64, lsl #16
+; CHECK-CVT-NEXT: fcmeq v5.4s, v1.4s, v1.4s
+; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT: orr v1.4s, #64, lsl #16
+; CHECK-CVT-NEXT: bit v2.16b, v3.16b, v4.16b
+; CHECK-CVT-NEXT: bif v0.16b, v1.16b, v5.16b
+; CHECK-CVT-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: div_h:
+; CHECK-BF16: // %bb.0: // %entry
+; CHECK-BF16-NEXT: shll v2.4s, v1.4h, #16
+; CHECK-BF16-NEXT: shll v3.4s, v0.4h, #16
+; CHECK-BF16-NEXT: shll2 v1.4s, v1.8h, #16
+; CHECK-BF16-NEXT: shll2 v0.4s, v0.8h, #16
+; CHECK-BF16-NEXT: fdiv v2.4s, v3.4s, v2.4s
+; CHECK-BF16-NEXT: fdiv v1.4s, v0.4s, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v2.4s
+; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s
+; CHECK-BF16-NEXT: ret
+entry:
+ %0 = fdiv <8 x bfloat> %a, %b ; both configs widen each half to f32 (shll/shll2 #16) and fdiv there; no +bf16 rounds back with integer adds (lsb + 0x7fff), an fcmeq/orr NaN select (bit/bif) and uzp2, +bf16 narrows with bfcvtn/bfcvtn2
+ ret <8 x bfloat> %0 ; NOTE(review): in the no-+bf16 expectation the ushr feeding the rounding lsb reads v0 while it still holds %a, not an fdiv result - confirm this is intended
+}
+
+
+define <8 x bfloat> @load_h(ptr %a) {
+; CHECK-LABEL: load_h:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = load <8 x bfloat>, ptr %a, align 4 ; 128-bit vector load with only 4-byte alignment; expected as a single ldr q0 in both configurations
+ ret <8 x bfloat> %0
+}
+
+
+define void @store_h(ptr %a, <8 x bfloat> %b) {
+; CHECK-LABEL: store_h:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+entry:
+ store <8 x bfloat> %b, ptr %a, align 4 ; 128-bit vector store with only 4-byte alignment; expected as a single str q0 in both configurations
+ ret void
+}
+
+define <8 x bfloat> @s_to_h(<8 x float> %a) {
+; CHECK-CVT-LABEL: s_to_h:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: movi v2.4s, #1
+; CHECK-CVT-NEXT: movi v3.4s, #127, msl #8
+; CHECK-CVT-NEXT: ushr v4.4s, v1.4s, #16
+; CHECK-CVT-NEXT: ushr v5.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v4.16b, v4.16b, v2.16b
+; CHECK-CVT-NEXT: add v6.4s, v1.4s, v3.4s
+; CHECK-CVT-NEXT: and v2.16b, v5.16b, v2.16b
+; CHECK-CVT-NEXT: add v3.4s, v0.4s, v3.4s
+; CHECK-CVT-NEXT: fcmeq v5.4s, v1.4s, v1.4s
+; CHECK-CVT-NEXT: orr v1.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v4.4s, v4.4s, v6.4s
+; CHECK-CVT-NEXT: fcmeq v6.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v2.4s, v2.4s, v3.4s
+; CHECK-CVT-NEXT: bit v1.16b, v4.16b, v5.16b
+; CHECK-CVT-NEXT: bit v0.16b, v2.16b, v6.16b
+; CHECK-CVT-NEXT: uzp2 v0.8h, v0.8h, v1.8h
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: s_to_h:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT: ret
+ %1 = fptrunc <8 x float> %a to <8 x bfloat> ; f32 -> bf16: no +bf16 rounds with integer ops (input lsb + 0x7fff), keeps NaN lanes via fcmeq/orr/bit and packs high halves with uzp2; +bf16 uses two bfcvtn plus a lane mov
+ ret <8 x bfloat> %1
+}
+
+define <8 x bfloat> @d_to_h(<8 x double> %a) {
+; CHECK-CVT-LABEL: d_to_h:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: fcvtxn v2.2s, v2.2d
+; CHECK-CVT-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-CVT-NEXT: fcvtxn2 v2.4s, v3.2d
+; CHECK-CVT-NEXT: fcvtxn2 v0.4s, v1.2d
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: movi v3.4s, #127, msl #8
+; CHECK-CVT-NEXT: ushr v4.4s, v2.4s, #16
+; CHECK-CVT-NEXT: ushr v5.4s, v0.4s, #16
+; CHECK-CVT-NEXT: add v6.4s, v2.4s, v3.4s
+; CHECK-CVT-NEXT: add v3.4s, v0.4s, v3.4s
+; CHECK-CVT-NEXT: and v4.16b, v4.16b, v1.16b
+; CHECK-CVT-NEXT: and v1.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT: fcmeq v5.4s, v2.4s, v2.4s
+; CHECK-CVT-NEXT: orr v2.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v4.4s, v4.4s, v6.4s
+; CHECK-CVT-NEXT: fcmeq v6.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT: add v1.4s, v1.4s, v3.4s
+; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT: bit v2.16b, v4.16b, v5.16b
+; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v6.16b
+; CHECK-CVT-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: d_to_h:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: fcvtxn v2.2s, v2.2d
+; CHECK-BF16-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-BF16-NEXT: fcvtxn2 v2.4s, v3.2d
+; CHECK-BF16-NEXT: fcvtxn2 v0.4s, v1.2d
+; CHECK-BF16-NEXT: bfcvtn v1.4h, v2.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT: ret
+ %1 = fptrunc <8 x double> %a to <8 x bfloat> ; f64 -> bf16 in two steps: fcvtxn/fcvtxn2 narrow to f32 first in both configs, then the f32 -> bf16 step is integer rounding with a NaN select (no +bf16) or bfcvtn (+bf16)
+ ret <8 x bfloat> %1
+}
+
+define <8 x float> @h_to_s(<8 x bfloat> %a) {
+; CHECK-LABEL: h_to_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: ret
+ %1 = fpext <8 x bfloat> %a to <8 x float> ; bf16 -> f32 is a pure bit shift: ext #8 extracts the upper four lanes, then shll #16 widens each half; same code in both configurations
+ ret <8 x float> %1
+}
+
+define <8 x double> @h_to_d(<8 x bfloat> %a) {
+; CHECK-LABEL: h_to_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: shll v2.4s, v0.4h, #16
+; CHECK-NEXT: fcvtl v0.2d, v2.2s
+; CHECK-NEXT: shll v4.4s, v1.4h, #16
+; CHECK-NEXT: fcvtl2 v1.2d, v2.4s
+; CHECK-NEXT: fcvtl2 v3.2d, v4.4s
+; CHECK-NEXT: fcvtl v2.2d, v4.2s
+; CHECK-NEXT: ret
+ %1 = fpext <8 x bfloat> %a to <8 x double> ; bf16 -> f64 goes through f32: shll #16 widens each half to f32, then fcvtl/fcvtl2 extend to f64 across v0-v3
+ ret <8 x double> %1
+}
+
+
+define <8 x bfloat> @bitcast_i_to_h(float, <8 x i16> %a) {
+; CHECK-LABEL: bitcast_i_to_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %2 = bitcast <8 x i16> %a to <8 x bfloat> ; the bitcast itself is free: the only expected instruction copies v1 to v0 (the unnamed leading float presumably occupies v0, pushing %a into v1)
+ ret <8 x bfloat> %2
+}
+
+define <8 x i16> @bitcast_h_to_i(float, <8 x bfloat> %a) {
+; CHECK-LABEL: bitcast_h_to_i:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %2 = bitcast <8 x bfloat> %a to <8 x i16> ; the bitcast itself is free: the only expected instruction copies v1 to v0 (the unnamed leading float presumably occupies v0, pushing %a into v1)
+ ret <8 x i16> %2
+}
+
+define <4 x bfloat> @sitofp_v4i8(<4 x i8> %a) #0 {
+; CHECK-CVT-LABEL: sitofp_v4i8:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: shl v0.4h, v0.4h, #8
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: sshr v0.4h, v0.4h, #8
+; CHECK-CVT-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-CVT-NEXT: scvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: sitofp_v4i8:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: shl v0.4h, v0.4h, #8
+; CHECK-BF16-NEXT: sshr v0.4h, v0.4h, #8
+; CHECK-BF16-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+ %1 = sitofp <4 x i8> %a to <4 x bfloat> ; i8 lanes arrive in 16-bit containers: shl/sshr #8 sign-extends in place, sshll widens to i32, scvtf converts; narrowing is lsb + 0x7fff with addhn (no +bf16, no NaN select) or bfcvtn (+bf16)
+ ret <4 x bfloat> %1
+}
+
+define <8 x bfloat> @sitofp_v8i8(<8 x i8> %a) #0 {
+; CHECK-CVT-LABEL: sitofp_v8i8:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: movi v4.4s, #127, msl #8
+; CHECK-CVT-NEXT: sshll v2.4s, v0.4h, #0
+; CHECK-CVT-NEXT: sshll2 v0.4s, v0.8h, #0
+; CHECK-CVT-NEXT: scvtf v2.4s, v2.4s
+; CHECK-CVT-NEXT: scvtf v3.4s, v0.4s
+; CHECK-CVT-NEXT: ushr v0.4s, v2.4s, #16
+; CHECK-CVT-NEXT: ushr v5.4s, v3.4s, #16
+; CHECK-CVT-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-CVT-NEXT: and v1.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT: add v0.4s, v0.4s, v4.4s
+; CHECK-CVT-NEXT: add v1.4s, v1.4s, v4.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v2.4s, v0.4s
+; CHECK-CVT-NEXT: addhn2 v0.8h, v3.4s, v1.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: sitofp_v8i8:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-BF16-NEXT: sshll2 v1.4s, v0.8h, #0
+; CHECK-BF16-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT: scvtf v1.4s, v1.4s
+; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT: ret
+ %1 = sitofp <8 x i8> %a to <8 x bfloat> ; sshll/sshll2 sign-extend i8 -> i16 -> i32, scvtf converts each half; narrowing is (lsb + 0x7fff) via addhn/addhn2 without +bf16, or bfcvtn plus a lane mov with +bf16
+ ret <8 x bfloat> %1
+}
+
+define <16 x bfloat> @sitofp_v16i8(<16 x i8> %a) #0 {
+; CHECK-CVT-LABEL: sitofp_v16i8:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: sshll2 v2.8h, v0.16b, #0
+; CHECK-CVT-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: movi v7.4s, #127, msl #8
+; CHECK-CVT-NEXT: sshll v3.4s, v2.4h, #0
+; CHECK-CVT-NEXT: sshll v4.4s, v0.4h, #0
+; CHECK-CVT-NEXT: sshll2 v2.4s, v2.8h, #0
+; CHECK-CVT-NEXT: sshll2 v0.4s, v0.8h, #0
+; CHECK-CVT-NEXT: scvtf v3.4s, v3.4s
+; CHECK-CVT-NEXT: scvtf v4.4s, v4.4s
+; CHECK-CVT-NEXT: scvtf v2.4s, v2.4s
+; CHECK-CVT-NEXT: scvtf v6.4s, v0.4s
+; CHECK-CVT-NEXT: ushr v5.4s, v3.4s, #16
+; CHECK-CVT-NEXT: ushr v0.4s, v4.4s, #16
+; CHECK-CVT-NEXT: ushr v16.4s, v2.4s, #16
+; CHECK-CVT-NEXT: ushr v17.4s, v6.4s, #16
+; CHECK-CVT-NEXT: and v5.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-CVT-NEXT: and v16.16b, v16.16b, v1.16b
+; CHECK-CVT-NEXT: and v17.16b, v17.16b, v1.16b
+; CHECK-CVT-NEXT: add v5.4s, v5.4s, v7.4s
+; CHECK-CVT-NEXT: add v0.4s, v0.4s, v7.4s
+; CHECK-CVT-NEXT: addhn v1.4h, v3.4s, v5.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v4.4s, v0.4s
+; CHECK-CVT-NEXT: add v3.4s, v16.4s, v7.4s
+; CHECK-CVT-NEXT: add v4.4s, v17.4s, v7.4s
+; CHECK-CVT-NEXT: addhn2 v1.8h, v2.4s, v3.4s
+; CHECK-CVT-NEXT: addhn2 v0.8h, v6.4s, v4.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: sitofp_v16i8:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: sshll2 v1.8h, v0.16b, #0
+; CHECK-BF16-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-BF16-NEXT: sshll2 v2.4s, v1.8h, #0
+; CHECK-BF16-NEXT: sshll v1.4s, v1.4h, #0
+; CHECK-BF16-NEXT: sshll2 v3.4s, v0.8h, #0
+; CHECK-BF16-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT: scvtf v2.4s, v2.4s
+; CHECK-BF16-NEXT: scvtf v1.4s, v1.4s
+; CHECK-BF16-NEXT: scvtf v3.4s, v3.4s
+; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v2.4h, v2.4s
+; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v3.4h, v3.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: mov v1.d[1], v2.d[0]
+; CHECK-BF16-NEXT: mov v0.d[1], v3.d[0]
+; CHECK-BF16-NEXT: ret
+ %1 = sitofp <16 x i8> %a to <16 x bfloat> ; 16 lanes split into four i32 quarters (sshll/sshll2 twice), each scvtf'd; result is returned in v0 and v1, narrowed by addhn/addhn2 of (lsb + 0x7fff) without +bf16 or by bfcvtn with +bf16
+ ret <16 x bfloat> %1
+}
+
+define <8 x bfloat> @sitofp_i16(<8 x i16> %a) #0 {
+; CHECK-CVT-LABEL: sitofp_i16:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: sshll v2.4s, v0.4h, #0
+; CHECK-CVT-NEXT: sshll2 v0.4s, v0.8h, #0
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: movi v4.4s, #127, msl #8
+; CHECK-CVT-NEXT: scvtf v2.4s, v2.4s
+; CHECK-CVT-NEXT: scvtf v3.4s, v0.4s
+; CHECK-CVT-NEXT: ushr v0.4s, v2.4s, #16
+; CHECK-CVT-NEXT: ushr v5.4s, v3.4s, #16
+; CHECK-CVT-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-CVT-NEXT: and v1.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT: add v0.4s, v0.4s, v4.4s
+; CHECK-CVT-NEXT: add v1.4s, v1.4s, v4.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v2.4s, v0.4s
+; CHECK-CVT-NEXT: addhn2 v0.8h, v3.4s, v1.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: sitofp_i16:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: sshll2 v1.4s, v0.8h, #0
+; CHECK-BF16-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT: scvtf v1.4s, v1.4s
+; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT: ret
+ %1 = sitofp <8 x i16> %a to <8 x bfloat> ; sshll/sshll2 sign-extend i16 -> i32, scvtf converts each half; narrowing is (lsb + 0x7fff) via addhn/addhn2 without +bf16, or bfcvtn plus a lane mov with +bf16
+ ret <8 x bfloat> %1
+}
+
+define <8 x bfloat> @sitofp_i32(<8 x i32> %a) #0 {
+; CHECK-CVT-LABEL: sitofp_i32:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: scvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT: movi v2.4s, #1
+; CHECK-CVT-NEXT: scvtf v1.4s, v1.4s
+; CHECK-CVT-NEXT: movi v5.4s, #127, msl #8
+; CHECK-CVT-NEXT: ushr v3.4s, v0.4s, #16
+; CHECK-CVT-NEXT: ushr v4.4s, v1.4s, #16
+; CHECK-CVT-NEXT: and v3.16b, v3.16b, v2.16b
+; CHECK-CVT-NEXT: and v2.16b, v4.16b, v2.16b
+; CHECK-CVT-NEXT: add v0.4s, v3.4s, v0.4s
+; CHECK-CVT-NEXT: add v1.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v5.4s
+; CHECK-CVT-NEXT: addhn2 v0.8h, v1.4s, v5.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: sitofp_i32:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: scvtf v1.4s, v1.4s
+; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT: ret
+ %1 = sitofp <8 x i32> %a to <8 x bfloat> ; scvtf converts the two i32 halves directly; narrowing adds the lsb to the value and then 0x7fff via addhn/addhn2 without +bf16, or uses bfcvtn plus a lane mov with +bf16
+ ret <8 x bfloat> %1
+}
+
+
+define <8 x bfloat> @sitofp_i64(<8 x i64> %a) #0 {
+; CHECK-CVT-LABEL: sitofp_i64:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: scvtf v2.2d, v2.2d
+; CHECK-CVT-NEXT: scvtf v0.2d, v0.2d
+; CHECK-CVT-NEXT: scvtf v3.2d, v3.2d
+; CHECK-CVT-NEXT: scvtf v1.2d, v1.2d
+; CHECK-CVT-NEXT: fcvtn v2.2s, v2.2d
+; CHECK-CVT-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-CVT-NEXT: fcvtn2 v2.4s, v3.2d
+; CHECK-CVT-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: movi v3.4s, #127, msl #8
+; CHECK-CVT-NEXT: ushr v4.4s, v2.4s, #16
+; CHECK-CVT-NEXT: ushr v5.4s, v0.4s, #16
+; CHECK-CVT-NEXT: add v6.4s, v2.4s, v3.4s
+; CHECK-CVT-NEXT: add v3.4s, v0.4s, v3.4s
+; CHECK-CVT-NEXT: and v4.16b, v4.16b, v1.16b
+; CHECK-CVT-NEXT: and v1.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT: fcmeq v5.4s, v2.4s, v2.4s
+; CHECK-CVT-NEXT: orr v2.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v4.4s, v4.4s, v6.4s
+; CHECK-CVT-NEXT: fcmeq v6.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT: add v1.4s, v1.4s, v3.4s
+; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT: bit v2.16b, v4.16b, v5.16b
+; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v6.16b
+; CHECK-CVT-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: sitofp_i64:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: scvtf v2.2d, v2.2d
+; CHECK-BF16-NEXT: scvtf v0.2d, v0.2d
+; CHECK-BF16-NEXT: scvtf v3.2d, v3.2d
+; CHECK-BF16-NEXT: scvtf v1.2d, v1.2d
+; CHECK-BF16-NEXT: fcvtn v2.2s, v2.2d
+; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-BF16-NEXT: fcvtn2 v2.4s, v3.2d
+; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-BF16-NEXT: bfcvtn v1.4h, v2.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT: ret
+ %1 = sitofp <8 x i64> %a to <8 x bfloat> ; i64 goes through f64 (scvtf .2d) and f32 (fcvtn/fcvtn2) in both configs; the last f32 -> bf16 step is integer rounding with an fcmeq/orr/bit NaN select and uzp2 (no +bf16) or bfcvtn (+bf16)
+ ret <8 x bfloat> %1
+}
+
+define <4 x bfloat> @uitofp_v4i8(<4 x i8> %a) #0 {
+; CHECK-CVT-LABEL: uitofp_v4i8:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: bic v0.4h, #255, lsl #8
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-CVT-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: uitofp_v4i8:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: bic v0.4h, #255, lsl #8
+; CHECK-BF16-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+ %1 = uitofp <4 x i8> %a to <4 x bfloat> ; i8 lanes arrive in 16-bit containers: bic clears the upper byte (zero-extend), ushll widens to i32, ucvtf converts; narrowing is lsb + 0x7fff with addhn (no +bf16, no NaN select) or bfcvtn (+bf16)
+ ret <4 x bfloat> %1
+}
+
+define <8 x bfloat> @uitofp_v8i8(<8 x i8> %a) #0 {
+; CHECK-CVT-LABEL: uitofp_v8i8:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: movi v4.4s, #127, msl #8
+; CHECK-CVT-NEXT: ushll v2.4s, v0.4h, #0
+; CHECK-CVT-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-CVT-NEXT: ucvtf v2.4s, v2.4s
+; CHECK-CVT-NEXT: ucvtf v3.4s, v0.4s
+; CHECK-CVT-NEXT: ushr v0.4s, v2.4s, #16
+; CHECK-CVT-NEXT: ushr v5.4s, v3.4s, #16
+; CHECK-CVT-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-CVT-NEXT: and v1.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT: add v0.4s, v0.4s, v4.4s
+; CHECK-CVT-NEXT: add v1.4s, v1.4s, v4.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v2.4s, v0.4s
+; CHECK-CVT-NEXT: addhn2 v0.8h, v3.4s, v1.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: uitofp_v8i8:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BF16-NEXT: ushll2 v1.4s, v0.8h, #0
+; CHECK-BF16-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT: ucvtf v1.4s, v1.4s
+; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT: ret
+ %1 = uitofp <8 x i8> %a to <8 x bfloat> ; ushll/ushll2 zero-extend i8 -> i16 -> i32, ucvtf converts each half; narrowing is (lsb + 0x7fff) via addhn/addhn2 without +bf16, or bfcvtn plus a lane mov with +bf16
+ ret <8 x bfloat> %1
+}
+
+define <16 x bfloat> @uitofp_v16i8(<16 x i8> %a) #0 {
+; CHECK-CVT-LABEL: uitofp_v16i8:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: ushll2 v2.8h, v0.16b, #0
+; CHECK-CVT-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: movi v7.4s, #127, msl #8
+; CHECK-CVT-NEXT: ushll v3.4s, v2.4h, #0
+; CHECK-CVT-NEXT: ushll v4.4s, v0.4h, #0
+; CHECK-CVT-NEXT: ushll2 v2.4s, v2.8h, #0
+; CHECK-CVT-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-CVT-NEXT: ucvtf v3.4s, v3.4s
+; CHECK-CVT-NEXT: ucvtf v4.4s, v4.4s
+; CHECK-CVT-NEXT: ucvtf v2.4s, v2.4s
+; CHECK-CVT-NEXT: ucvtf v6.4s, v0.4s
+; CHECK-CVT-NEXT: ushr v5.4s, v3.4s, #16
+; CHECK-CVT-NEXT: ushr v0.4s, v4.4s, #16
+; CHECK-CVT-NEXT: ushr v16.4s, v2.4s, #16
+; CHECK-CVT-NEXT: ushr v17.4s, v6.4s, #16
+; CHECK-CVT-NEXT: and v5.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-CVT-NEXT: and v16.16b, v16.16b, v1.16b
+; CHECK-CVT-NEXT: and v17.16b, v17.16b, v1.16b
+; CHECK-CVT-NEXT: add v5.4s, v5.4s, v7.4s
+; CHECK-CVT-NEXT: add v0.4s, v0.4s, v7.4s
+; CHECK-CVT-NEXT: addhn v1.4h, v3.4s, v5.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v4.4s, v0.4s
+; CHECK-CVT-NEXT: add v3.4s, v16.4s, v7.4s
+; CHECK-CVT-NEXT: add v4.4s, v17.4s, v7.4s
+; CHECK-CVT-NEXT: addhn2 v1.8h, v2.4s, v3.4s
+; CHECK-CVT-NEXT: addhn2 v0.8h, v6.4s, v4.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: uitofp_v16i8:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: ushll2 v1.8h, v0.16b, #0
+; CHECK-BF16-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BF16-NEXT: ushll2 v2.4s, v1.8h, #0
+; CHECK-BF16-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-BF16-NEXT: ushll2 v3.4s, v0.8h, #0
+; CHECK-BF16-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT: ucvtf v2.4s, v2.4s
+; CHECK-BF16-NEXT: ucvtf v1.4s, v1.4s
+; CHECK-BF16-NEXT: ucvtf v3.4s, v3.4s
+; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v2.4h, v2.4s
+; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v3.4h, v3.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: mov v1.d[1], v2.d[0]
+; CHECK-BF16-NEXT: mov v0.d[1], v3.d[0]
+; CHECK-BF16-NEXT: ret
+ %1 = uitofp <16 x i8> %a to <16 x bfloat> ; 16 lanes split into four i32 quarters (ushll/ushll2 twice), each ucvtf'd; result is returned in v0 and v1, narrowed by addhn/addhn2 of (lsb + 0x7fff) without +bf16 or by bfcvtn with +bf16
+ ret <16 x bfloat> %1
+}
+
+
+define <8 x bfloat> @uitofp_i16(<8 x i16> %a) #0 {
+; CHECK-CVT-LABEL: uitofp_i16:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: ushll v2.4s, v0.4h, #0
+; CHECK-CVT-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: movi v4.4s, #127, msl #8
+; CHECK-CVT-NEXT: ucvtf v2.4s, v2.4s
+; CHECK-CVT-NEXT: ucvtf v3.4s, v0.4s
+; CHECK-CVT-NEXT: ushr v0.4s, v2.4s, #16
+; CHECK-CVT-NEXT: ushr v5.4s, v3.4s, #16
+; CHECK-CVT-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-CVT-NEXT: and v1.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT: add v0.4s, v0.4s, v4.4s
+; CHECK-CVT-NEXT: add v1.4s, v1.4s, v4.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v2.4s, v0.4s
+; CHECK-CVT-NEXT: addhn2 v0.8h, v3.4s, v1.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: uitofp_i16:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: ushll2 v1.4s, v0.8h, #0
+; CHECK-BF16-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT: ucvtf v1.4s, v1.4s
+; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT: ret
+ %1 = uitofp <8 x i16> %a to <8 x bfloat> ; ushll/ushll2 zero-extend i16 -> i32, ucvtf converts each half; narrowing is (lsb + 0x7fff) via addhn/addhn2 without +bf16, or bfcvtn plus a lane mov with +bf16
+ ret <8 x bfloat> %1
+}
+
+
+define <8 x bfloat> @uitofp_i32(<8 x i32> %a) #0 {
+; CHECK-CVT-LABEL: uitofp_i32:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT: movi v2.4s, #1
+; CHECK-CVT-NEXT: ucvtf v1.4s, v1.4s
+; CHECK-CVT-NEXT: movi v5.4s, #127, msl #8
+; CHECK-CVT-NEXT: ushr v3.4s, v0.4s, #16
+; CHECK-CVT-NEXT: ushr v4.4s, v1.4s, #16
+; CHECK-CVT-NEXT: and v3.16b, v3.16b, v2.16b
+; CHECK-CVT-NEXT: and v2.16b, v4.16b, v2.16b
+; CHECK-CVT-NEXT: add v0.4s, v3.4s, v0.4s
+; CHECK-CVT-NEXT: add v1.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v5.4s
+; CHECK-CVT-NEXT: addhn2 v0.8h, v1.4s, v5.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: uitofp_i32:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: ucvtf v1.4s, v1.4s
+; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT: ret
+ %1 = uitofp <8 x i32> %a to <8 x bfloat> ; ucvtf converts the two i32 halves directly; narrowing adds the lsb to the value and then 0x7fff via addhn/addhn2 without +bf16, or uses bfcvtn plus a lane mov with +bf16
+ ret <8 x bfloat> %1
+}
+
+
+define <8 x bfloat> @uitofp_i64(<8 x i64> %a) #0 {
+; CHECK-CVT-LABEL: uitofp_i64:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: ucvtf v2.2d, v2.2d
+; CHECK-CVT-NEXT: ucvtf v0.2d, v0.2d
+; CHECK-CVT-NEXT: ucvtf v3.2d, v3.2d
+; CHECK-CVT-NEXT: ucvtf v1.2d, v1.2d
+; CHECK-CVT-NEXT: fcvtn v2.2s, v2.2d
+; CHECK-CVT-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-CVT-NEXT: fcvtn2 v2.4s, v3.2d
+; CHECK-CVT-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: movi v3.4s, #127, msl #8
+; CHECK-CVT-NEXT: ushr v4.4s, v2.4s, #16
+; CHECK-CVT-NEXT: ushr v5.4s, v0.4s, #16
+; CHECK-CVT-NEXT: add v6.4s, v2.4s, v3.4s
+; CHECK-CVT-NEXT: add v3.4s, v0.4s, v3.4s
+; CHECK-CVT-NEXT: and v4.16b, v4.16b, v1.16b
+; CHECK-CVT-NEXT: and v1.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT: fcmeq v5.4s, v2.4s, v2.4s
+; CHECK-CVT-NEXT: orr v2.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v4.4s, v4.4s, v6.4s
+; CHECK-CVT-NEXT: fcmeq v6.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT: add v1.4s, v1.4s, v3.4s
+; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT: bit v2.16b, v4.16b, v5.16b
+; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v6.16b
+; CHECK-CVT-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: uitofp_i64:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: ucvtf v2.2d, v2.2d
+; CHECK-BF16-NEXT: ucvtf v0.2d, v0.2d
+; CHECK-BF16-NEXT: ucvtf v3.2d, v3.2d
+; CHECK-BF16-NEXT: ucvtf v1.2d, v1.2d
+; CHECK-BF16-NEXT: fcvtn v2.2s, v2.2d
+; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-BF16-NEXT: fcvtn2 v2.4s, v3.2d
+; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-BF16-NEXT: bfcvtn v1.4h, v2.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT: ret
+ %1 = uitofp <8 x i64> %a to <8 x bfloat> ; i64 goes through f64 (ucvtf .2d) and f32 (fcvtn/fcvtn2) in both configs; the last f32 -> bf16 step is integer rounding with an fcmeq/orr/bit NaN select and uzp2 (no +bf16) or bfcvtn (+bf16)
+ ret <8 x bfloat> %1
+}
+
+define void @test_insert_at_zero(bfloat %a, ptr %b) #0 {
+; CHECK-LABEL: test_insert_at_zero:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+ %1 = insertelement <8 x bfloat> poison, bfloat %a, i64 0 ; lanes 1-7 are don't-care (poison replaces the deprecated undef placeholder), so no explicit lane insert is expected
+ store <8 x bfloat> %1, ptr %b, align 4 ; expected as a plain str q0: the scalar in h0 is simply treated as lane 0 of q0
+ ret void
+}
+
+define <8 x i8> @fptosi_i8(<8 x bfloat> %a) #0 {
+; CHECK-LABEL: fptosi_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: fcvtzs v1.4s, v1.4s
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: ret
+ %1 = fptosi<8 x bfloat> %a to <8 x i8> ; signed bf16 -> i8: each half is widened to f32 (ext #8 + shll #16) and converted with fcvtzs, then uzp1 packs to 16-bit lanes and xtn narrows to bytes
+ ret <8 x i8> %1
+}
+
+define <8 x i16> @fptosi_i16(<8 x bfloat> %a) #0 { ; Codegen test: signed float-to-integer conversion from <8 x bfloat> to <8 x i16>.
+; CHECK-LABEL: fptosi_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: fcvtzs v1.4s, v1.4s
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
+ %1 = fptosi<8 x bfloat> %a to <8 x i16> ; expected: ext splits off the high half, shll #16 moves each 16-bit lane to the top of a 32-bit lane, fcvtzs converts each 4-lane half
+ ret <8 x i16> %1 ; the two halves are merged into eight 16-bit lanes with uzp1; no further narrowing is expected
+}
+
+define <8 x i8> @fptoui_i8(<8 x bfloat> %a) #0 { ; Codegen test: unsigned float-to-integer conversion from <8 x bfloat> to <8 x i8>.
+; CHECK-LABEL: fptoui_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: fcvtzu v1.4s, v1.4s
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: ret
+ %1 = fptoui<8 x bfloat> %a to <8 x i8> ; expected: ext splits off the high half, shll #16 moves each 16-bit lane to the top of a 32-bit lane, fcvtzu converts each 4-lane half
+ ret <8 x i8> %1 ; the two halves are merged with uzp1 and narrowed to bytes with xtn
+}
+
+define <8 x i16> @fptoui_i16(<8 x bfloat> %a) #0 { ; Codegen test: unsigned float-to-integer conversion from <8 x bfloat> to <8 x i16>.
+; CHECK-LABEL: fptoui_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: fcvtzu v1.4s, v1.4s
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
+ %1 = fptoui<8 x bfloat> %a to <8 x i16> ; expected: ext splits off the high half, shll #16 moves each 16-bit lane to the top of a 32-bit lane, fcvtzu converts each 4-lane half
+ ret <8 x i16> %1 ; the two halves are merged into eight 16-bit lanes with uzp1; no further narrowing is expected
+}
+
+define <8 x i1> @test_fcmp_une(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; Codegen test: lane-wise "une" (unordered or not equal) compare of two <8 x bfloat> vectors.
+; CHECK-LABEL: test_fcmp_une:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: csetm w9, ne
+; CHECK-NEXT: fmov s16, w10
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: mov h6, v0.h[4]
+; CHECK-NEXT: mov h4, v1.h[5]
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: fcmp s16, s7
+; CHECK-NEXT: mov v2.h[1], w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w9, s6
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s7, s3
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: fmov w9, s5
+; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s5, w8
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s4
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csetm w10, ne
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp une <8 x bfloat> %a, %b ; expected code is scalar: each lane is moved to a GPR, shifted left by 16, moved back to an s register and compared with fcmp; "csetm ..., ne" forms each lane mask
+ ret <8 x i1> %1 ; the eight masks are collected in v2.h[0..7] and narrowed to bytes with xtn
+}
+
+define <8 x i1> @test_fcmp_ueq(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; Codegen test: lane-wise "ueq" (unordered or equal) compare of two <8 x bfloat> vectors.
+; CHECK-LABEL: test_fcmp_ueq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w11, s0
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: lsl w9, w11, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s7, w9
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: csetm w10, eq
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csinv w10, w10, wzr, vc
+; CHECK-NEXT: fcmp s7, s6
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w11, s4
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h4, v0.h[4]
+; CHECK-NEXT: mov h7, v1.h[5]
+; CHECK-NEXT: csetm w9, eq
+; CHECK-NEXT: csinv w9, w9, wzr, vc
+; CHECK-NEXT: fcmp s3, s2
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: fmov s2, w9
+; CHECK-NEXT: lsl w11, w11, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov s5, w11
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: csetm w8, eq
+; CHECK-NEXT: mov v2.h[1], w10
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: csinv w8, w8, wzr, vc
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov s6, w10
+; CHECK-NEXT: csetm w8, eq
+; CHECK-NEXT: fmov w9, s7
+; CHECK-NEXT: fmov w10, s5
+; CHECK-NEXT: csinv w8, w8, wzr, vc
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: fmov s6, w10
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: csetm w8, eq
+; CHECK-NEXT: csinv w8, w8, wzr, vc
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: lsl w8, w9, #16
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: csetm w10, eq
+; CHECK-NEXT: csinv w10, w10, wzr, vc
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, eq
+; CHECK-NEXT: csinv w8, w8, wzr, vc
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: csetm w8, eq
+; CHECK-NEXT: csinv w8, w8, wzr, vc
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp ueq <8 x bfloat> %a, %b ; expected code is scalar, one fcmp per lane; each mask takes two steps: "csetm ..., eq", then "csinv ..., wzr, vc", which yields all-ones when the V flag is set
+ ret <8 x i1> %1 ; the eight masks are collected in v2.h[0..7] and narrowed to bytes with xtn
+}
+
+define <8 x i1> @test_fcmp_ugt(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; Codegen test: lane-wise "ugt" (unordered or greater than) compare of two <8 x bfloat> vectors.
+; CHECK-LABEL: test_fcmp_ugt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: csetm w9, hi
+; CHECK-NEXT: fmov s16, w10
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: mov h6, v0.h[4]
+; CHECK-NEXT: mov h4, v1.h[5]
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, hi
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: fcmp s16, s7
+; CHECK-NEXT: mov v2.h[1], w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w9, s6
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, hi
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s7, s3
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: fmov w9, s5
+; CHECK-NEXT: csetm w8, hi
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s5, w8
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: csetm w8, hi
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s4
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csetm w10, hi
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, hi
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: csetm w8, hi
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp ugt <8 x bfloat> %a, %b ; expected code is scalar: each lane is moved to a GPR, shifted left by 16, moved back to an s register and compared with fcmp; "csetm ..., hi" forms each lane mask
+ ret <8 x i1> %1 ; the eight masks are collected in v2.h[0..7] and narrowed to bytes with xtn
+}
+
+define <8 x i1> @test_fcmp_uge(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; Codegen test: lane-wise "uge" (unordered or greater than or equal) compare of two <8 x bfloat> vectors.
+; CHECK-LABEL: test_fcmp_uge:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: csetm w9, pl
+; CHECK-NEXT: fmov s16, w10
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: mov h6, v0.h[4]
+; CHECK-NEXT: mov h4, v1.h[5]
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, pl
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: fcmp s16, s7
+; CHECK-NEXT: mov v2.h[1], w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w9, s6
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, pl
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s7, s3
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: fmov w9, s5
+; CHECK-NEXT: csetm w8, pl
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s5, w8
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: csetm w8, pl
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s4
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csetm w10, pl
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, pl
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: csetm w8, pl
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp uge <8 x bfloat> %a, %b ; expected code is scalar: each lane is moved to a GPR, shifted left by 16, moved back to an s register and compared with fcmp; "csetm ..., pl" forms each lane mask
+ ret <8 x i1> %1 ; the eight masks are collected in v2.h[0..7] and narrowed to bytes with xtn
+}
+
+define <8 x i1> @test_fcmp_ult(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; Codegen test: lane-wise "ult" (unordered or less than) compare of two <8 x bfloat> vectors.
+; CHECK-LABEL: test_fcmp_ult:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: csetm w9, lt
+; CHECK-NEXT: fmov s16, w10
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: mov h6, v0.h[4]
+; CHECK-NEXT: mov h4, v1.h[5]
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, lt
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: fcmp s16, s7
+; CHECK-NEXT: mov v2.h[1], w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w9, s6
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, lt
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s7, s3
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: fmov w9, s5
+; CHECK-NEXT: csetm w8, lt
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s5, w8
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: csetm w8, lt
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s4
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csetm w10, lt
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, lt
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: csetm w8, lt
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp ult <8 x bfloat> %a, %b ; expected code is scalar: each lane is moved to a GPR, shifted left by 16, moved back to an s register and compared with fcmp; "csetm ..., lt" forms each lane mask
+ ret <8 x i1> %1 ; the eight masks are collected in v2.h[0..7] and narrowed to bytes with xtn
+}
+
+define <8 x i1> @test_fcmp_ule(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; Codegen test: lane-wise "ule" (unordered or less than or equal) compare of two <8 x bfloat> vectors.
+; CHECK-LABEL: test_fcmp_ule:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: csetm w9, le
+; CHECK-NEXT: fmov s16, w10
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: mov h6, v0.h[4]
+; CHECK-NEXT: mov h4, v1.h[5]
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, le
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: fcmp s16, s7
+; CHECK-NEXT: mov v2.h[1], w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w9, s6
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, le
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s7, s3
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: fmov w9, s5
+; CHECK-NEXT: csetm w8, le
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s5, w8
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: csetm w8, le
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s4
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csetm w10, le
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, le
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: csetm w8, le
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp ule <8 x bfloat> %a, %b ; expected code is scalar: each lane is moved to a GPR, shifted left by 16, moved back to an s register and compared with fcmp; "csetm ..., le" forms each lane mask
+ ret <8 x i1> %1 ; the eight masks are collected in v2.h[0..7] and narrowed to bytes with xtn
+}
+
+define <8 x i1> @test_fcmp_uno(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; Codegen test: lane-wise "uno" (unordered) compare of two <8 x bfloat> vectors.
+; CHECK-LABEL: test_fcmp_uno:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: csetm w9, vs
+; CHECK-NEXT: fmov s16, w10
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: mov h6, v0.h[4]
+; CHECK-NEXT: mov h4, v1.h[5]
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, vs
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: fcmp s16, s7
+; CHECK-NEXT: mov v2.h[1], w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w9, s6
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, vs
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s7, s3
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: fmov w9, s5
+; CHECK-NEXT: csetm w8, vs
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s5, w8
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: csetm w8, vs
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s4
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csetm w10, vs
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, vs
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: csetm w8, vs
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp uno <8 x bfloat> %a, %b ; expected code is scalar: each lane is moved to a GPR, shifted left by 16, moved back to an s register and compared with fcmp; "csetm ..., vs" forms each lane mask
+ ret <8 x i1> %1 ; the eight masks are collected in v2.h[0..7] and narrowed to bytes with xtn
+}
+
+define <8 x i1> @test_fcmp_one(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; Codegen test: lane-wise "one" (ordered and not equal) compare of two <8 x bfloat> vectors.
+; CHECK-LABEL: test_fcmp_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w11, s0
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: lsl w9, w11, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s7, w9
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: csetm w10, mi
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csinv w10, w10, wzr, le
+; CHECK-NEXT: fcmp s7, s6
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w11, s4
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h4, v0.h[4]
+; CHECK-NEXT: mov h7, v1.h[5]
+; CHECK-NEXT: csetm w9, mi
+; CHECK-NEXT: csinv w9, w9, wzr, le
+; CHECK-NEXT: fcmp s3, s2
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: fmov s2, w9
+; CHECK-NEXT: lsl w11, w11, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov s5, w11
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: csetm w8, mi
+; CHECK-NEXT: mov v2.h[1], w10
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: csinv w8, w8, wzr, le
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov s6, w10
+; CHECK-NEXT: csetm w8, mi
+; CHECK-NEXT: fmov w9, s7
+; CHECK-NEXT: fmov w10, s5
+; CHECK-NEXT: csinv w8, w8, wzr, le
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: fmov s6, w10
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: csetm w8, mi
+; CHECK-NEXT: csinv w8, w8, wzr, le
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: lsl w8, w9, #16
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: csetm w10, mi
+; CHECK-NEXT: csinv w10, w10, wzr, le
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, mi
+; CHECK-NEXT: csinv w8, w8, wzr, le
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: csetm w8, mi
+; CHECK-NEXT: csinv w8, w8, wzr, le
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp one <8 x bfloat> %a, %b ; expected code is scalar, one fcmp per lane; each mask takes two steps: "csetm ..., mi", then "csinv ..., wzr, le", which yields all-ones when the le condition does not hold
+ ret <8 x i1> %1 ; the eight masks are collected in v2.h[0..7] and narrowed to bytes with xtn
+}
+
+define <8 x i1> @test_fcmp_oeq(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; Codegen test: lane-wise "oeq" (ordered and equal) compare of two <8 x bfloat> vectors.
+; CHECK-LABEL: test_fcmp_oeq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: csetm w9, eq
+; CHECK-NEXT: fmov s16, w10
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: mov h6, v0.h[4]
+; CHECK-NEXT: mov h4, v1.h[5]
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, eq
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: fcmp s16, s7
+; CHECK-NEXT: mov v2.h[1], w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w9, s6
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, eq
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s7, s3
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: fmov w9, s5
+; CHECK-NEXT: csetm w8, eq
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s5, w8
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: csetm w8, eq
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s4
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csetm w10, eq
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, eq
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: csetm w8, eq
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp oeq <8 x bfloat> %a, %b ; expected code is scalar: each lane is moved to a GPR, shifted left by 16, moved back to an s register and compared with fcmp; "csetm ..., eq" forms each lane mask
+ ret <8 x i1> %1 ; the eight masks are collected in v2.h[0..7] and narrowed to bytes with xtn
+}
+
+define <8 x i1> @test_fcmp_ogt(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; Codegen test: lane-wise "ogt" (ordered and greater than) compare of two <8 x bfloat> vectors.
+; CHECK-LABEL: test_fcmp_ogt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: csetm w9, gt
+; CHECK-NEXT: fmov s16, w10
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: mov h6, v0.h[4]
+; CHECK-NEXT: mov h4, v1.h[5]
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, gt
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: fcmp s16, s7
+; CHECK-NEXT: mov v2.h[1], w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w9, s6
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, gt
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s7, s3
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: fmov w9, s5
+; CHECK-NEXT: csetm w8, gt
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s5, w8
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: csetm w8, gt
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s4
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csetm w10, gt
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, gt
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: csetm w8, gt
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp ogt <8 x bfloat> %a, %b ; expected code is scalar: each lane is moved to a GPR, shifted left by 16, moved back to an s register and compared with fcmp; "csetm ..., gt" forms each lane mask
+ ret <8 x i1> %1 ; the eight masks are collected in v2.h[0..7] and narrowed to bytes with xtn
+}
+
+define <8 x i1> @test_fcmp_oge(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; Codegen test: lane-wise "oge" (ordered and greater than or equal) compare of two <8 x bfloat> vectors.
+; CHECK-LABEL: test_fcmp_oge:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: csetm w9, ge
+; CHECK-NEXT: fmov s16, w10
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: mov h6, v0.h[4]
+; CHECK-NEXT: mov h4, v1.h[5]
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, ge
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: fcmp s16, s7
+; CHECK-NEXT: mov v2.h[1], w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w9, s6
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, ge
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s7, s3
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: fmov w9, s5
+; CHECK-NEXT: csetm w8, ge
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s5, w8
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: csetm w8, ge
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s4
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csetm w10, ge
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, ge
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: csetm w8, ge
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp oge <8 x bfloat> %a, %b ; expected code is scalar: each lane is moved to a GPR, shifted left by 16, moved back to an s register and compared with fcmp; "csetm ..., ge" forms each lane mask
+ ret <8 x i1> %1 ; the eight masks are collected in v2.h[0..7] and narrowed to bytes with xtn
+}
+
+define <8 x i1> @test_fcmp_olt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_olt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: csetm w9, mi
+; CHECK-NEXT: fmov s16, w10
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: mov h6, v0.h[4]
+; CHECK-NEXT: mov h4, v1.h[5]
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, mi
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: fcmp s16, s7
+; CHECK-NEXT: mov v2.h[1], w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w9, s6
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, mi
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s7, s3
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: fmov w9, s5
+; CHECK-NEXT: csetm w8, mi
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s5, w8
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: csetm w8, mi
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s4
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csetm w10, mi
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, mi
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: csetm w8, mi
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp olt <8 x bfloat> %a, %b ; ordered less-than per lane; expected output above scalarizes this into 8 fcmp + "csetm ..., mi" pairs, with every lane of %a and %b shifted left by 16 before the compare
+ ret <8 x i1> %1 ; expected output gathers the 8 lane masks in v2.h[0..7] and narrows them with "xtn v0.8b, v2.8h" before returning
+}
+
+define <8 x i1> @test_fcmp_ole(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ole:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: csetm w9, ls
+; CHECK-NEXT: fmov s16, w10
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: mov h6, v0.h[4]
+; CHECK-NEXT: mov h4, v1.h[5]
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, ls
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: fcmp s16, s7
+; CHECK-NEXT: mov v2.h[1], w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w9, s6
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, ls
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s7, s3
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: fmov w9, s5
+; CHECK-NEXT: csetm w8, ls
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s5, w8
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: csetm w8, ls
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s4
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csetm w10, ls
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, ls
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: csetm w8, ls
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp ole <8 x bfloat> %a, %b ; ordered less-or-equal per lane; expected output above scalarizes this into 8 fcmp + "csetm ..., ls" pairs, with every lane of %a and %b shifted left by 16 before the compare
+ ret <8 x i1> %1 ; expected output gathers the 8 lane masks in v2.h[0..7] and narrows them with "xtn v0.8b, v2.8h" before returning
+}
+
+define <8 x i1> @test_fcmp_ord(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ord:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: csetm w9, vc
+; CHECK-NEXT: fmov s16, w10
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: mov h6, v0.h[4]
+; CHECK-NEXT: mov h4, v1.h[5]
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, vc
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: fcmp s16, s7
+; CHECK-NEXT: mov v2.h[1], w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w9, s6
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, vc
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s7, s3
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: fmov w9, s5
+; CHECK-NEXT: csetm w8, vc
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s5, w8
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: csetm w8, vc
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s4
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csetm w10, vc
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, vc
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: csetm w8, vc
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp ord <8 x bfloat> %a, %b ; "ordered" predicate per lane (true when neither input is NaN); expected output above scalarizes this into 8 fcmp + "csetm ..., vc" pairs, with every lane of %a and %b shifted left by 16 before the compare
+ ret <8 x i1> %1 ; expected output gathers the 8 lane masks in v2.h[0..7] and narrows them with "xtn v0.8b, v2.8h" before returning
+}
+
+attributes #0 = { nounwind } ; attribute group referenced as #0 by the test functions defined above
More information about the llvm-commits
mailing list