[llvm] [AArch64] Don't try to vectorize fixed point to fp narrowing conversion (PR #130665)

Pranav Kant via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 20 14:36:20 PDT 2025


https://github.com/pranavk updated https://github.com/llvm/llvm-project/pull/130665

>From fe60f64f9dba35a778cc94d27671eb5fe22d4ff6 Mon Sep 17 00:00:00 2001
From: Pranav Kant <prka at google.com>
Date: Thu, 20 Mar 2025 21:34:14 +0000
Subject: [PATCH 1/2] init

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 0db6c614684d7..3830a7a3d380a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5106,6 +5106,30 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
   uint64_t VTSize = VT.getFixedSizeInBits();
   uint64_t InVTSize = InVT.getFixedSizeInBits();
   if (VTSize < InVTSize) {
+    // AArch64 doesn't have a direct vector instruction to convert an
+    // integer to floating point AND narrow it at the same time. Doing it
+    // in two steps rounds twice, which for an f32/f64 result causes subtle
+    // differences from platforms that do have such instructions.
+    // Conversion to f16, however, is fine.
+    bool IsTargetf32Orf64 = VT.getVectorElementType() == MVT::f32 ||
+                            VT.getVectorElementType() == MVT::f64;
+    bool IsTargetf16 = false;
+    if (Op.hasOneUse() && Op->user_begin()->getOpcode() == ISD::CONCAT_VECTORS) {
+      // Some vector types are split in half during legalization, converted,
+      // concatenated, and then rounded to the original vector type. If that
+      // final type is f16, we shouldn't worry about rounding errors.
+      SDNode *U = *Op->user_begin();
+      if (U->hasOneUse() && U->user_begin()->getOpcode() == ISD::FP_ROUND) {
+        EVT TmpVT = U->user_begin()->getValueType(0);
+        if (TmpVT.isVector() && TmpVT.getVectorElementType() == MVT::f16)
+          IsTargetf16 = true;
+      }
+    }
+
+    if (IsTargetf32Orf64 && !IsTargetf16) {
+      return SDValue();
+    }
+
     MVT CastVT =
         MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
                          InVT.getVectorNumElements());

>From e6a91ff4ba479bcb2df7e6bbcc5e2060f7faf1ff Mon Sep 17 00:00:00 2001
From: Pranav Kant <prka at google.com>
Date: Thu, 20 Mar 2025 21:34:38 +0000
Subject: [PATCH 2/2] modify tests

---
 .../aarch64-neon-vector-insert-uaddlv.ll      |   75 +-
 .../CodeGen/AArch64/arm64-convert-v4f64.ll    |   37 +-
 .../CodeGen/AArch64/bf16-v4-instructions.ll   |  122 +-
 .../CodeGen/AArch64/bf16-v8-instructions.ll   |  261 +++-
 .../test/CodeGen/AArch64/complex-int-to-fp.ll |  177 ++-
 .../fold-int-pow2-with-fmul-or-fdiv.ll        |   19 +-
 .../CodeGen/AArch64/fp-intrinsics-vector.ll   |   51 +-
 llvm/test/CodeGen/AArch64/fprcvt-cvtf.ll      |   49 +-
 llvm/test/CodeGen/AArch64/itofp-bf16.ll       | 1239 ++++++++++++-----
 llvm/test/CodeGen/AArch64/itofp.ll            |  922 +++++++++---
 .../AArch64/sve-fixed-length-int-to-fp.ll     |   53 +-
 llvm/test/CodeGen/AArch64/vector-fcvt.ll      |   98 +-
 12 files changed, 2368 insertions(+), 735 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
index b357a24f892ff..aba45ebb89c34 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
@@ -146,11 +146,26 @@ define void @insert_vec_v6i64_uaddlv_from_v4i32(ptr %0) {
 ; CHECK-LABEL: insert_vec_v6i64_uaddlv_from_v4i32:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    movi.2d v0, #0000000000000000
+; CHECK-NEXT:    movi.2d v2, #0x000000ffffffff
 ; CHECK-NEXT:    str xzr, [x0, #16]
 ; CHECK-NEXT:    uaddlv.4s d1, v0
 ; CHECK-NEXT:    mov.d v0[0], v1[0]
-; CHECK-NEXT:    ucvtf.2d v0, v0
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    ushr.2d v1, v0, #32
+; CHECK-NEXT:    and.16b v0, v0, v2
+; CHECK-NEXT:    mov.d x8, v1[1]
+; CHECK-NEXT:    fmov x9, d1
+; CHECK-NEXT:    scvtf s2, x9
+; CHECK-NEXT:    mov w9, #1333788672 ; =0x4f800000
+; CHECK-NEXT:    scvtf s1, x8
+; CHECK-NEXT:    mov.d x8, v0[1]
+; CHECK-NEXT:    dup.2s v3, w9
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    scvtf s0, x8
+; CHECK-NEXT:    mov.s v2[1], v1[0]
+; CHECK-NEXT:    scvtf s1, x9
+; CHECK-NEXT:    fmul.2s v2, v2, v3
+; CHECK-NEXT:    mov.s v1[1], v0[0]
+; CHECK-NEXT:    fadd.2s v0, v2, v1
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
 
@@ -166,10 +181,25 @@ define void @insert_vec_v2i64_uaddlv_from_v4i32(ptr %0) {
 ; CHECK-LABEL: insert_vec_v2i64_uaddlv_from_v4i32:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    movi.2d v0, #0000000000000000
+; CHECK-NEXT:    movi.2d v2, #0x000000ffffffff
 ; CHECK-NEXT:    uaddlv.4s d1, v0
 ; CHECK-NEXT:    mov.d v0[0], v1[0]
-; CHECK-NEXT:    ucvtf.2d v0, v0
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    ushr.2d v1, v0, #32
+; CHECK-NEXT:    and.16b v0, v0, v2
+; CHECK-NEXT:    mov.d x8, v1[1]
+; CHECK-NEXT:    fmov x9, d1
+; CHECK-NEXT:    scvtf s2, x9
+; CHECK-NEXT:    mov w9, #1333788672 ; =0x4f800000
+; CHECK-NEXT:    scvtf s1, x8
+; CHECK-NEXT:    mov.d x8, v0[1]
+; CHECK-NEXT:    dup.2s v3, w9
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    scvtf s0, x8
+; CHECK-NEXT:    mov.s v2[1], v1[0]
+; CHECK-NEXT:    scvtf s1, x9
+; CHECK-NEXT:    fmul.2s v2, v2, v3
+; CHECK-NEXT:    mov.s v1[1], v0[0]
+; CHECK-NEXT:    fadd.2s v0, v2, v1
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
 
@@ -185,11 +215,26 @@ define void @insert_vec_v5i64_uaddlv_from_v4i32(ptr %0) {
 ; CHECK-LABEL: insert_vec_v5i64_uaddlv_from_v4i32:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    movi.2d v0, #0000000000000000
+; CHECK-NEXT:    movi.2d v2, #0x000000ffffffff
 ; CHECK-NEXT:    str wzr, [x0, #16]
 ; CHECK-NEXT:    uaddlv.4s d1, v0
 ; CHECK-NEXT:    mov.d v0[0], v1[0]
-; CHECK-NEXT:    ucvtf.2d v0, v0
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    ushr.2d v1, v0, #32
+; CHECK-NEXT:    and.16b v0, v0, v2
+; CHECK-NEXT:    mov.d x8, v1[1]
+; CHECK-NEXT:    fmov x9, d1
+; CHECK-NEXT:    scvtf s2, x9
+; CHECK-NEXT:    mov w9, #1333788672 ; =0x4f800000
+; CHECK-NEXT:    scvtf s1, x8
+; CHECK-NEXT:    mov.d x8, v0[1]
+; CHECK-NEXT:    dup.2s v3, w9
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    scvtf s0, x8
+; CHECK-NEXT:    mov.s v2[1], v1[0]
+; CHECK-NEXT:    scvtf s1, x9
+; CHECK-NEXT:    fmul.2s v2, v2, v3
+; CHECK-NEXT:    mov.s v1[1], v0[0]
+; CHECK-NEXT:    fadd.2s v0, v2, v1
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
 
@@ -250,12 +295,20 @@ define void @insert_vec_v16i64_uaddlv_from_v4i16(ptr %0) {
 ; CHECK-LABEL: insert_vec_v16i64_uaddlv_from_v4i16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    movi.2d v0, #0000000000000000
-; CHECK-NEXT:    movi.2d v2, #0000000000000000
-; CHECK-NEXT:    uaddlv.4h s1, v0
+; CHECK-NEXT:    movi.2d v1, #0000000000000000
+; CHECK-NEXT:    mov w9, #1333788672 ; =0x4f800000
+; CHECK-NEXT:    scvtf s3, xzr
+; CHECK-NEXT:    dup.2s v4, w9
+; CHECK-NEXT:    uaddlv.4h s2, v0
 ; CHECK-NEXT:    stp q0, q0, [x0, #32]
-; CHECK-NEXT:    mov.s v2[0], v1[0]
-; CHECK-NEXT:    ucvtf.2d v1, v2
-; CHECK-NEXT:    fcvtn v1.2s, v1.2d
+; CHECK-NEXT:    mov.s v1[0], v2[0]
+; CHECK-NEXT:    mov.d x8, v1[1]
+; CHECK-NEXT:    scvtf s2, x8
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    scvtf s1, x8
+; CHECK-NEXT:    mov.s v1[1], v2[0]
+; CHECK-NEXT:    fmul.2s v2, v4, v3[0]
+; CHECK-NEXT:    fadd.2s v1, v2, v1
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
 
diff --git a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
index 508f68d6f14d4..c1653f3f45287 100644
--- a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
@@ -53,13 +53,40 @@ define <4 x half> @uitofp_v4i64_to_v4f16(ptr %ptr) {
 define <4 x bfloat> @uitofp_v4i64_to_v4bf16(ptr %ptr) {
 ; CHECK-LABEL: uitofp_v4i64_to_v4bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q1, q3, [x0]
+; CHECK-NEXT:    movi v0.2d, #0x000000ffffffff
+; CHECK-NEXT:    ushr v2.2d, v1.2d, #32
+; CHECK-NEXT:    ushr v5.2d, v3.2d, #32
+; CHECK-NEXT:    and v1.16b, v1.16b, v0.16b
+; CHECK-NEXT:    and v0.16b, v3.16b, v0.16b
+; CHECK-NEXT:    mov x8, v2.d[1]
+; CHECK-NEXT:    fmov x10, d2
+; CHECK-NEXT:    mov x9, v1.d[1]
+; CHECK-NEXT:    scvtf s4, x10
+; CHECK-NEXT:    scvtf s2, x8
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    scvtf s1, x9
+; CHECK-NEXT:    mov x9, v5.d[1]
+; CHECK-NEXT:    scvtf s3, x8
+; CHECK-NEXT:    fmov x8, d5
+; CHECK-NEXT:    mov v4.s[1], v2.s[0]
+; CHECK-NEXT:    scvtf s2, x8
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    mov v3.s[1], v1.s[0]
+; CHECK-NEXT:    scvtf s1, x8
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    scvtf s0, x9
+; CHECK-NEXT:    mov v4.s[2], v2.s[0]
 ; CHECK-NEXT:    movi v2.4s, #127, msl #8
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    ucvtf v1.2d, v1.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-NEXT:    mov v3.s[2], v1.s[0]
+; CHECK-NEXT:    scvtf s1, x8
+; CHECK-NEXT:    mov w8, #1333788672 // =0x4f800000
+; CHECK-NEXT:    mov v4.s[3], v0.s[0]
+; CHECK-NEXT:    dup v0.4s, w8
+; CHECK-NEXT:    mov v3.s[3], v1.s[0]
 ; CHECK-NEXT:    movi v1.4s, #1
+; CHECK-NEXT:    fmul v0.4s, v4.4s, v0.4s
+; CHECK-NEXT:    fadd v0.4s, v0.4s, v3.4s
 ; CHECK-NEXT:    ushr v3.4s, v0.4s, #16
 ; CHECK-NEXT:    add v2.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    and v1.16b, v3.16b, v1.16b
diff --git a/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll
index 1cd0294b0083e..42da624a5f068 100644
--- a/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll
@@ -310,29 +310,43 @@ define <4 x bfloat> @sitofp_i32(<4 x i32> %a) #0 {
 define <4 x bfloat> @sitofp_i64(<4 x i64> %a) #0 {
 ; CHECK-CVT-LABEL: sitofp_i64:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-CVT-NEXT:    scvtf v1.2d, v1.2d
-; CHECK-CVT-NEXT:    movi v2.4s, #127, msl #8
-; CHECK-CVT-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-CVT-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-CVT-NEXT:    movi v1.4s, #1
-; CHECK-CVT-NEXT:    ushr v3.4s, v0.4s, #16
-; CHECK-CVT-NEXT:    add v2.4s, v0.4s, v2.4s
-; CHECK-CVT-NEXT:    and v1.16b, v3.16b, v1.16b
-; CHECK-CVT-NEXT:    fcmeq v3.4s, v0.4s, v0.4s
-; CHECK-CVT-NEXT:    orr v0.4s, #64, lsl #16
-; CHECK-CVT-NEXT:    add v1.4s, v1.4s, v2.4s
-; CHECK-CVT-NEXT:    bit v0.16b, v1.16b, v3.16b
+; CHECK-CVT-NEXT:    mov x8, v0.d[1]
+; CHECK-CVT-NEXT:    fmov x9, d0
+; CHECK-CVT-NEXT:    scvtf s2, x9
+; CHECK-CVT-NEXT:    mov x9, v1.d[1]
+; CHECK-CVT-NEXT:    scvtf s0, x8
+; CHECK-CVT-NEXT:    fmov x8, d1
+; CHECK-CVT-NEXT:    scvtf s1, x8
+; CHECK-CVT-NEXT:    mov v2.s[1], v0.s[0]
+; CHECK-CVT-NEXT:    scvtf s0, x9
+; CHECK-CVT-NEXT:    mov v2.s[2], v1.s[0]
+; CHECK-CVT-NEXT:    movi v1.4s, #127, msl #8
+; CHECK-CVT-NEXT:    mov v2.s[3], v0.s[0]
+; CHECK-CVT-NEXT:    movi v0.4s, #1
+; CHECK-CVT-NEXT:    ushr v3.4s, v2.4s, #16
+; CHECK-CVT-NEXT:    add v1.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT:    and v0.16b, v3.16b, v0.16b
+; CHECK-CVT-NEXT:    fcmeq v3.4s, v2.4s, v2.4s
+; CHECK-CVT-NEXT:    orr v2.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    bif v0.16b, v2.16b, v3.16b
 ; CHECK-CVT-NEXT:    shrn v0.4h, v0.4s, #16
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-BF16-LABEL: sitofp_i64:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-BF16-NEXT:    scvtf v1.2d, v1.2d
-; CHECK-BF16-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-BF16-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    mov x8, v0.d[1]
+; CHECK-BF16-NEXT:    fmov x9, d0
+; CHECK-BF16-NEXT:    scvtf s2, x9
+; CHECK-BF16-NEXT:    mov x9, v1.d[1]
+; CHECK-BF16-NEXT:    scvtf s0, x8
+; CHECK-BF16-NEXT:    fmov x8, d1
+; CHECK-BF16-NEXT:    mov v2.s[1], v0.s[0]
+; CHECK-BF16-NEXT:    scvtf s0, x8
+; CHECK-BF16-NEXT:    mov v2.s[2], v0.s[0]
+; CHECK-BF16-NEXT:    scvtf s0, x9
+; CHECK-BF16-NEXT:    mov v2.s[3], v0.s[0]
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v2.4s
 ; CHECK-BF16-NEXT:    ret
   %1 = sitofp <4 x i64> %a to <4 x bfloat>
   ret <4 x bfloat> %1
@@ -413,12 +427,39 @@ define <4 x bfloat> @uitofp_i32(<4 x i32> %a) #0 {
 define <4 x bfloat> @uitofp_i64(<4 x i64> %a) #0 {
 ; CHECK-CVT-LABEL: uitofp_i64:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-CVT-NEXT:    ucvtf v1.2d, v1.2d
-; CHECK-CVT-NEXT:    movi v2.4s, #127, msl #8
-; CHECK-CVT-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-CVT-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-CVT-NEXT:    movi v2.2d, #0x000000ffffffff
+; CHECK-CVT-NEXT:    ushr v3.2d, v0.2d, #32
+; CHECK-CVT-NEXT:    ushr v4.2d, v1.2d, #32
+; CHECK-CVT-NEXT:    mov x8, v3.d[1]
+; CHECK-CVT-NEXT:    fmov x10, d3
+; CHECK-CVT-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-CVT-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-CVT-NEXT:    scvtf s3, x10
+; CHECK-CVT-NEXT:    scvtf s5, x8
+; CHECK-CVT-NEXT:    fmov x8, d0
+; CHECK-CVT-NEXT:    mov x9, v0.d[1]
+; CHECK-CVT-NEXT:    scvtf s2, x8
+; CHECK-CVT-NEXT:    fmov x8, d4
+; CHECK-CVT-NEXT:    scvtf s0, x9
+; CHECK-CVT-NEXT:    mov x9, v4.d[1]
+; CHECK-CVT-NEXT:    mov v3.s[1], v5.s[0]
+; CHECK-CVT-NEXT:    scvtf s4, x8
+; CHECK-CVT-NEXT:    fmov x8, d1
+; CHECK-CVT-NEXT:    mov v2.s[1], v0.s[0]
+; CHECK-CVT-NEXT:    scvtf s0, x8
+; CHECK-CVT-NEXT:    mov x8, v1.d[1]
+; CHECK-CVT-NEXT:    scvtf s1, x9
+; CHECK-CVT-NEXT:    mov v3.s[2], v4.s[0]
+; CHECK-CVT-NEXT:    mov v2.s[2], v0.s[0]
+; CHECK-CVT-NEXT:    scvtf s0, x8
+; CHECK-CVT-NEXT:    mov w8, #1333788672 // =0x4f800000
+; CHECK-CVT-NEXT:    mov v3.s[3], v1.s[0]
+; CHECK-CVT-NEXT:    dup v1.4s, w8
+; CHECK-CVT-NEXT:    mov v2.s[3], v0.s[0]
+; CHECK-CVT-NEXT:    fmul v0.4s, v3.4s, v1.4s
 ; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    fadd v0.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT:    movi v2.4s, #127, msl #8
 ; CHECK-CVT-NEXT:    ushr v3.4s, v0.4s, #16
 ; CHECK-CVT-NEXT:    add v2.4s, v0.4s, v2.4s
 ; CHECK-CVT-NEXT:    and v1.16b, v3.16b, v1.16b
@@ -431,10 +472,37 @@ define <4 x bfloat> @uitofp_i64(<4 x i64> %a) #0 {
 ;
 ; CHECK-BF16-LABEL: uitofp_i64:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-BF16-NEXT:    ucvtf v1.2d, v1.2d
-; CHECK-BF16-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-BF16-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-BF16-NEXT:    movi v2.2d, #0x000000ffffffff
+; CHECK-BF16-NEXT:    ushr v3.2d, v0.2d, #32
+; CHECK-BF16-NEXT:    ushr v4.2d, v1.2d, #32
+; CHECK-BF16-NEXT:    mov x8, v3.d[1]
+; CHECK-BF16-NEXT:    fmov x10, d3
+; CHECK-BF16-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-BF16-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-BF16-NEXT:    scvtf s3, x10
+; CHECK-BF16-NEXT:    scvtf s5, x8
+; CHECK-BF16-NEXT:    fmov x8, d0
+; CHECK-BF16-NEXT:    mov x9, v0.d[1]
+; CHECK-BF16-NEXT:    scvtf s2, x8
+; CHECK-BF16-NEXT:    fmov x8, d4
+; CHECK-BF16-NEXT:    scvtf s0, x9
+; CHECK-BF16-NEXT:    mov x9, v4.d[1]
+; CHECK-BF16-NEXT:    mov v3.s[1], v5.s[0]
+; CHECK-BF16-NEXT:    scvtf s4, x8
+; CHECK-BF16-NEXT:    fmov x8, d1
+; CHECK-BF16-NEXT:    mov v2.s[1], v0.s[0]
+; CHECK-BF16-NEXT:    scvtf s0, x8
+; CHECK-BF16-NEXT:    mov x8, v1.d[1]
+; CHECK-BF16-NEXT:    scvtf s1, x9
+; CHECK-BF16-NEXT:    mov v3.s[2], v4.s[0]
+; CHECK-BF16-NEXT:    mov v2.s[2], v0.s[0]
+; CHECK-BF16-NEXT:    scvtf s0, x8
+; CHECK-BF16-NEXT:    mov w8, #1333788672 // =0x4f800000
+; CHECK-BF16-NEXT:    mov v3.s[3], v1.s[0]
+; CHECK-BF16-NEXT:    dup v1.4s, w8
+; CHECK-BF16-NEXT:    mov v2.s[3], v0.s[0]
+; CHECK-BF16-NEXT:    fmul v0.4s, v3.4s, v1.4s
+; CHECK-BF16-NEXT:    fadd v0.4s, v0.4s, v2.4s
 ; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
 ; CHECK-BF16-NEXT:    ret
   %1 = uitofp <4 x i64> %a to <4 x bfloat>
diff --git a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
index 2eaa58de92807..e525ada5c9a61 100644
--- a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
@@ -489,45 +489,74 @@ define <8 x bfloat> @sitofp_i32(<8 x i32> %a) #0 {
 define <8 x bfloat> @sitofp_i64(<8 x i64> %a) #0 {
 ; CHECK-CVT-LABEL: sitofp_i64:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    scvtf v2.2d, v2.2d
-; CHECK-CVT-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-CVT-NEXT:    scvtf v3.2d, v3.2d
-; CHECK-CVT-NEXT:    scvtf v1.2d, v1.2d
-; CHECK-CVT-NEXT:    fcvtn v2.2s, v2.2d
-; CHECK-CVT-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-CVT-NEXT:    fcvtn2 v2.4s, v3.2d
-; CHECK-CVT-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-CVT-NEXT:    movi v1.4s, #1
-; CHECK-CVT-NEXT:    movi v3.4s, #127, msl #8
-; CHECK-CVT-NEXT:    ushr v4.4s, v2.4s, #16
-; CHECK-CVT-NEXT:    ushr v5.4s, v0.4s, #16
-; CHECK-CVT-NEXT:    add v6.4s, v2.4s, v3.4s
-; CHECK-CVT-NEXT:    add v3.4s, v0.4s, v3.4s
-; CHECK-CVT-NEXT:    and v4.16b, v4.16b, v1.16b
-; CHECK-CVT-NEXT:    and v1.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT:    fmov x10, d2
+; CHECK-CVT-NEXT:    mov x8, v2.d[1]
+; CHECK-CVT-NEXT:    mov x9, v0.d[1]
+; CHECK-CVT-NEXT:    scvtf s2, x10
+; CHECK-CVT-NEXT:    fmov x10, d0
+; CHECK-CVT-NEXT:    scvtf s0, x8
+; CHECK-CVT-NEXT:    scvtf s5, x9
+; CHECK-CVT-NEXT:    fmov x9, d3
+; CHECK-CVT-NEXT:    mov x8, v3.d[1]
+; CHECK-CVT-NEXT:    scvtf s4, x10
+; CHECK-CVT-NEXT:    fmov x10, d1
+; CHECK-CVT-NEXT:    scvtf s3, x9
+; CHECK-CVT-NEXT:    mov x9, v1.d[1]
+; CHECK-CVT-NEXT:    mov v2.s[1], v0.s[0]
+; CHECK-CVT-NEXT:    scvtf s0, x10
+; CHECK-CVT-NEXT:    scvtf s1, x8
+; CHECK-CVT-NEXT:    mov v4.s[1], v5.s[0]
+; CHECK-CVT-NEXT:    mov v2.s[2], v3.s[0]
+; CHECK-CVT-NEXT:    scvtf s3, x9
+; CHECK-CVT-NEXT:    mov v4.s[2], v0.s[0]
+; CHECK-CVT-NEXT:    movi v0.4s, #1
+; CHECK-CVT-NEXT:    mov v2.s[3], v1.s[0]
+; CHECK-CVT-NEXT:    movi v1.4s, #127, msl #8
+; CHECK-CVT-NEXT:    mov v4.s[3], v3.s[0]
+; CHECK-CVT-NEXT:    ushr v3.4s, v2.4s, #16
+; CHECK-CVT-NEXT:    add v6.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT:    ushr v5.4s, v4.4s, #16
+; CHECK-CVT-NEXT:    add v1.4s, v4.4s, v1.4s
+; CHECK-CVT-NEXT:    and v3.16b, v3.16b, v0.16b
+; CHECK-CVT-NEXT:    and v0.16b, v5.16b, v0.16b
 ; CHECK-CVT-NEXT:    fcmeq v5.4s, v2.4s, v2.4s
 ; CHECK-CVT-NEXT:    orr v2.4s, #64, lsl #16
-; CHECK-CVT-NEXT:    add v4.4s, v4.4s, v6.4s
-; CHECK-CVT-NEXT:    fcmeq v6.4s, v0.4s, v0.4s
-; CHECK-CVT-NEXT:    add v1.4s, v1.4s, v3.4s
-; CHECK-CVT-NEXT:    orr v0.4s, #64, lsl #16
-; CHECK-CVT-NEXT:    bit v2.16b, v4.16b, v5.16b
-; CHECK-CVT-NEXT:    bit v0.16b, v1.16b, v6.16b
-; CHECK-CVT-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT:    add v3.4s, v3.4s, v6.4s
+; CHECK-CVT-NEXT:    fcmeq v6.4s, v4.4s, v4.4s
+; CHECK-CVT-NEXT:    orr v4.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    mov v1.16b, v5.16b
+; CHECK-CVT-NEXT:    bif v0.16b, v4.16b, v6.16b
+; CHECK-CVT-NEXT:    bsl v1.16b, v3.16b, v2.16b
+; CHECK-CVT-NEXT:    uzp2 v0.8h, v0.8h, v1.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-BF16-LABEL: sitofp_i64:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-BF16-NEXT:    scvtf v2.2d, v2.2d
-; CHECK-BF16-NEXT:    scvtf v1.2d, v1.2d
-; CHECK-BF16-NEXT:    scvtf v3.2d, v3.2d
-; CHECK-BF16-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-BF16-NEXT:    fcvtn v2.2s, v2.2d
-; CHECK-BF16-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-BF16-NEXT:    fcvtn2 v2.4s, v3.2d
-; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
-; CHECK-BF16-NEXT:    bfcvtn2 v0.8h, v2.4s
+; CHECK-BF16-NEXT:    mov x9, v0.d[1]
+; CHECK-BF16-NEXT:    fmov x10, d0
+; CHECK-BF16-NEXT:    mov x8, v2.d[1]
+; CHECK-BF16-NEXT:    scvtf s4, x10
+; CHECK-BF16-NEXT:    fmov x10, d1
+; CHECK-BF16-NEXT:    scvtf s0, x9
+; CHECK-BF16-NEXT:    fmov x9, d2
+; CHECK-BF16-NEXT:    scvtf s2, x8
+; CHECK-BF16-NEXT:    mov x8, v1.d[1]
+; CHECK-BF16-NEXT:    scvtf s1, x9
+; CHECK-BF16-NEXT:    fmov x9, d3
+; CHECK-BF16-NEXT:    mov v4.s[1], v0.s[0]
+; CHECK-BF16-NEXT:    scvtf s0, x10
+; CHECK-BF16-NEXT:    mov x10, v3.d[1]
+; CHECK-BF16-NEXT:    scvtf s3, x9
+; CHECK-BF16-NEXT:    mov v1.s[1], v2.s[0]
+; CHECK-BF16-NEXT:    scvtf s2, x8
+; CHECK-BF16-NEXT:    mov v4.s[2], v0.s[0]
+; CHECK-BF16-NEXT:    scvtf s0, x10
+; CHECK-BF16-NEXT:    mov v1.s[2], v3.s[0]
+; CHECK-BF16-NEXT:    mov v4.s[3], v2.s[0]
+; CHECK-BF16-NEXT:    mov v1.s[3], v0.s[0]
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v4.4s
+; CHECK-BF16-NEXT:    bfcvtn2 v0.8h, v1.4s
 ; CHECK-BF16-NEXT:    ret
   %1 = sitofp <8 x i64> %a to <8 x bfloat>
   ret <8 x bfloat> %1
@@ -712,45 +741,147 @@ define <8 x bfloat> @uitofp_i32(<8 x i32> %a) #0 {
 define <8 x bfloat> @uitofp_i64(<8 x i64> %a) #0 {
 ; CHECK-CVT-LABEL: uitofp_i64:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    ucvtf v2.2d, v2.2d
-; CHECK-CVT-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-CVT-NEXT:    ucvtf v3.2d, v3.2d
-; CHECK-CVT-NEXT:    ucvtf v1.2d, v1.2d
-; CHECK-CVT-NEXT:    fcvtn v2.2s, v2.2d
-; CHECK-CVT-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-CVT-NEXT:    fcvtn2 v2.4s, v3.2d
-; CHECK-CVT-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-CVT-NEXT:    movi v1.4s, #1
+; CHECK-CVT-NEXT:    ushr v5.2d, v2.2d, #32
+; CHECK-CVT-NEXT:    movi v4.2d, #0x000000ffffffff
+; CHECK-CVT-NEXT:    ushr v6.2d, v0.2d, #32
+; CHECK-CVT-NEXT:    ushr v7.2d, v3.2d, #32
+; CHECK-CVT-NEXT:    ushr v16.2d, v1.2d, #32
+; CHECK-CVT-NEXT:    fmov x10, d5
+; CHECK-CVT-NEXT:    mov x8, v5.d[1]
+; CHECK-CVT-NEXT:    mov x9, v6.d[1]
+; CHECK-CVT-NEXT:    and v2.16b, v2.16b, v4.16b
+; CHECK-CVT-NEXT:    and v3.16b, v3.16b, v4.16b
+; CHECK-CVT-NEXT:    and v0.16b, v0.16b, v4.16b
+; CHECK-CVT-NEXT:    and v1.16b, v1.16b, v4.16b
+; CHECK-CVT-NEXT:    fmov x12, d7
+; CHECK-CVT-NEXT:    mov x11, v7.d[1]
+; CHECK-CVT-NEXT:    scvtf s4, x10
+; CHECK-CVT-NEXT:    fmov x10, d6
+; CHECK-CVT-NEXT:    mov x13, v2.d[1]
+; CHECK-CVT-NEXT:    scvtf s5, x8
+; CHECK-CVT-NEXT:    mov x8, v0.d[1]
+; CHECK-CVT-NEXT:    scvtf s7, x9
+; CHECK-CVT-NEXT:    scvtf s17, x12
+; CHECK-CVT-NEXT:    fmov x12, d16
+; CHECK-CVT-NEXT:    scvtf s6, x10
+; CHECK-CVT-NEXT:    fmov x10, d2
+; CHECK-CVT-NEXT:    mov x9, v16.d[1]
+; CHECK-CVT-NEXT:    scvtf s16, x13
+; CHECK-CVT-NEXT:    mov v4.s[1], v5.s[0]
+; CHECK-CVT-NEXT:    fmov x13, d1
+; CHECK-CVT-NEXT:    scvtf s2, x10
+; CHECK-CVT-NEXT:    fmov x10, d0
+; CHECK-CVT-NEXT:    scvtf s0, x12
+; CHECK-CVT-NEXT:    mov v6.s[1], v7.s[0]
+; CHECK-CVT-NEXT:    scvtf s7, x8
+; CHECK-CVT-NEXT:    mov x8, v3.d[1]
+; CHECK-CVT-NEXT:    mov x12, v1.d[1]
+; CHECK-CVT-NEXT:    scvtf s5, x10
+; CHECK-CVT-NEXT:    fmov x10, d3
+; CHECK-CVT-NEXT:    scvtf s3, x11
+; CHECK-CVT-NEXT:    mov v2.s[1], v16.s[0]
+; CHECK-CVT-NEXT:    mov v4.s[2], v17.s[0]
+; CHECK-CVT-NEXT:    scvtf s16, x13
+; CHECK-CVT-NEXT:    mov v6.s[2], v0.s[0]
+; CHECK-CVT-NEXT:    scvtf s0, x9
+; CHECK-CVT-NEXT:    mov w9, #1333788672 // =0x4f800000
+; CHECK-CVT-NEXT:    scvtf s1, x10
+; CHECK-CVT-NEXT:    mov v5.s[1], v7.s[0]
+; CHECK-CVT-NEXT:    dup v7.4s, w9
+; CHECK-CVT-NEXT:    mov v4.s[3], v3.s[0]
+; CHECK-CVT-NEXT:    scvtf s3, x12
+; CHECK-CVT-NEXT:    mov v6.s[3], v0.s[0]
+; CHECK-CVT-NEXT:    mov v2.s[2], v1.s[0]
+; CHECK-CVT-NEXT:    scvtf s1, x8
+; CHECK-CVT-NEXT:    mov v5.s[2], v16.s[0]
+; CHECK-CVT-NEXT:    fmul v0.4s, v4.4s, v7.4s
+; CHECK-CVT-NEXT:    mov v2.s[3], v1.s[0]
+; CHECK-CVT-NEXT:    fmul v1.4s, v6.4s, v7.4s
+; CHECK-CVT-NEXT:    mov v5.s[3], v3.s[0]
 ; CHECK-CVT-NEXT:    movi v3.4s, #127, msl #8
-; CHECK-CVT-NEXT:    ushr v4.4s, v2.4s, #16
-; CHECK-CVT-NEXT:    ushr v5.4s, v0.4s, #16
-; CHECK-CVT-NEXT:    add v6.4s, v2.4s, v3.4s
-; CHECK-CVT-NEXT:    add v3.4s, v0.4s, v3.4s
-; CHECK-CVT-NEXT:    and v4.16b, v4.16b, v1.16b
-; CHECK-CVT-NEXT:    and v1.16b, v5.16b, v1.16b
-; CHECK-CVT-NEXT:    fcmeq v5.4s, v2.4s, v2.4s
-; CHECK-CVT-NEXT:    orr v2.4s, #64, lsl #16
-; CHECK-CVT-NEXT:    add v4.4s, v4.4s, v6.4s
-; CHECK-CVT-NEXT:    fcmeq v6.4s, v0.4s, v0.4s
-; CHECK-CVT-NEXT:    add v1.4s, v1.4s, v3.4s
+; CHECK-CVT-NEXT:    fadd v0.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT:    movi v2.4s, #1
+; CHECK-CVT-NEXT:    fadd v1.4s, v1.4s, v5.4s
+; CHECK-CVT-NEXT:    ushr v4.4s, v0.4s, #16
+; CHECK-CVT-NEXT:    add v6.4s, v0.4s, v3.4s
+; CHECK-CVT-NEXT:    ushr v5.4s, v1.4s, #16
+; CHECK-CVT-NEXT:    add v3.4s, v1.4s, v3.4s
+; CHECK-CVT-NEXT:    fcmeq v7.4s, v1.4s, v1.4s
+; CHECK-CVT-NEXT:    orr v1.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    and v4.16b, v4.16b, v2.16b
+; CHECK-CVT-NEXT:    and v2.16b, v5.16b, v2.16b
+; CHECK-CVT-NEXT:    fcmeq v5.4s, v0.4s, v0.4s
 ; CHECK-CVT-NEXT:    orr v0.4s, #64, lsl #16
-; CHECK-CVT-NEXT:    bit v2.16b, v4.16b, v5.16b
-; CHECK-CVT-NEXT:    bit v0.16b, v1.16b, v6.16b
-; CHECK-CVT-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT:    add v4.4s, v4.4s, v6.4s
+; CHECK-CVT-NEXT:    add v2.4s, v2.4s, v3.4s
+; CHECK-CVT-NEXT:    bit v0.16b, v4.16b, v5.16b
+; CHECK-CVT-NEXT:    bit v1.16b, v2.16b, v7.16b
+; CHECK-CVT-NEXT:    uzp2 v0.8h, v1.8h, v0.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-BF16-LABEL: uitofp_i64:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-BF16-NEXT:    ucvtf v2.2d, v2.2d
-; CHECK-BF16-NEXT:    ucvtf v1.2d, v1.2d
-; CHECK-BF16-NEXT:    ucvtf v3.2d, v3.2d
-; CHECK-BF16-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-BF16-NEXT:    fcvtn v2.2s, v2.2d
-; CHECK-BF16-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-BF16-NEXT:    fcvtn2 v2.4s, v3.2d
+; CHECK-BF16-NEXT:    movi v4.2d, #0x000000ffffffff
+; CHECK-BF16-NEXT:    ushr v6.2d, v2.2d, #32
+; CHECK-BF16-NEXT:    ushr v5.2d, v0.2d, #32
+; CHECK-BF16-NEXT:    ushr v7.2d, v1.2d, #32
+; CHECK-BF16-NEXT:    mov x9, v6.d[1]
+; CHECK-BF16-NEXT:    fmov x10, d6
+; CHECK-BF16-NEXT:    mov x8, v5.d[1]
+; CHECK-BF16-NEXT:    and v0.16b, v0.16b, v4.16b
+; CHECK-BF16-NEXT:    and v2.16b, v2.16b, v4.16b
+; CHECK-BF16-NEXT:    and v1.16b, v1.16b, v4.16b
+; CHECK-BF16-NEXT:    scvtf s6, x10
+; CHECK-BF16-NEXT:    fmov x10, d5
+; CHECK-BF16-NEXT:    mov x11, v0.d[1]
+; CHECK-BF16-NEXT:    scvtf s17, x9
+; CHECK-BF16-NEXT:    fmov x9, d0
+; CHECK-BF16-NEXT:    scvtf s16, x8
+; CHECK-BF16-NEXT:    mov x8, v2.d[1]
+; CHECK-BF16-NEXT:    scvtf s5, x10
+; CHECK-BF16-NEXT:    mov x10, v7.d[1]
+; CHECK-BF16-NEXT:    scvtf s0, x9
+; CHECK-BF16-NEXT:    fmov x9, d2
+; CHECK-BF16-NEXT:    scvtf s2, x11
+; CHECK-BF16-NEXT:    fmov x11, d7
+; CHECK-BF16-NEXT:    ushr v7.2d, v3.2d, #32
+; CHECK-BF16-NEXT:    and v3.16b, v3.16b, v4.16b
+; CHECK-BF16-NEXT:    mov v6.s[1], v17.s[0]
+; CHECK-BF16-NEXT:    mov v5.s[1], v16.s[0]
+; CHECK-BF16-NEXT:    scvtf s17, x9
+; CHECK-BF16-NEXT:    scvtf s16, x8
+; CHECK-BF16-NEXT:    scvtf s4, x11
+; CHECK-BF16-NEXT:    fmov x9, d7
+; CHECK-BF16-NEXT:    fmov x11, d1
+; CHECK-BF16-NEXT:    mov x8, v7.d[1]
+; CHECK-BF16-NEXT:    mov v0.s[1], v2.s[0]
+; CHECK-BF16-NEXT:    scvtf s7, x9
+; CHECK-BF16-NEXT:    scvtf s2, x11
+; CHECK-BF16-NEXT:    mov x9, v1.d[1]
+; CHECK-BF16-NEXT:    mov v5.s[2], v4.s[0]
+; CHECK-BF16-NEXT:    scvtf s1, x10
+; CHECK-BF16-NEXT:    fmov x10, d3
+; CHECK-BF16-NEXT:    mov x11, v3.d[1]
+; CHECK-BF16-NEXT:    mov v17.s[1], v16.s[0]
+; CHECK-BF16-NEXT:    scvtf s4, x8
+; CHECK-BF16-NEXT:    mov w8, #1333788672 // =0x4f800000
+; CHECK-BF16-NEXT:    scvtf s3, x10
+; CHECK-BF16-NEXT:    mov v6.s[2], v7.s[0]
+; CHECK-BF16-NEXT:    mov v0.s[2], v2.s[0]
+; CHECK-BF16-NEXT:    scvtf s2, x9
+; CHECK-BF16-NEXT:    mov v5.s[3], v1.s[0]
+; CHECK-BF16-NEXT:    dup v1.4s, w8
+; CHECK-BF16-NEXT:    mov v17.s[2], v3.s[0]
+; CHECK-BF16-NEXT:    scvtf s3, x11
+; CHECK-BF16-NEXT:    mov v6.s[3], v4.s[0]
+; CHECK-BF16-NEXT:    mov v0.s[3], v2.s[0]
+; CHECK-BF16-NEXT:    fmul v2.4s, v5.4s, v1.4s
+; CHECK-BF16-NEXT:    mov v17.s[3], v3.s[0]
+; CHECK-BF16-NEXT:    fmul v1.4s, v6.4s, v1.4s
+; CHECK-BF16-NEXT:    fadd v0.4s, v2.4s, v0.4s
+; CHECK-BF16-NEXT:    fadd v1.4s, v1.4s, v17.4s
 ; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
-; CHECK-BF16-NEXT:    bfcvtn2 v0.8h, v2.4s
+; CHECK-BF16-NEXT:    bfcvtn2 v0.8h, v1.4s
 ; CHECK-BF16-NEXT:    ret
   %1 = uitofp <8 x i64> %a to <8 x bfloat>
   ret <8 x bfloat> %1
diff --git a/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll b/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll
index 506e5e59a3529..4c24f219bdd46 100644
--- a/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll
@@ -1,9 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
 
-; CHECK: autogen_SD19655
-; CHECK: scvtf
-; CHECK: ret
 define void @autogen_SD19655(ptr %addr, ptr %addrfloat) {
+; CHECK-LABEL: autogen_SD19655:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    mov.d x8, v0[1]
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    scvtf s1, x9
+; CHECK-NEXT:    scvtf s0, x8
+; CHECK-NEXT:    mov.s v1[1], v0[0]
+; CHECK-NEXT:    str d1, [x1]
+; CHECK-NEXT:    ret
   %T = load <2 x i64>, ptr %addr
   %F = sitofp <2 x i64> %T to <2 x float>
   store <2 x float> %F, ptr %addrfloat
@@ -12,38 +20,44 @@ define void @autogen_SD19655(ptr %addr, ptr %addrfloat) {
 
 define <2 x double> @test_signed_v2i32_to_v2f64(<2 x i32> %v) nounwind readnone {
 ; CHECK-LABEL: test_signed_v2i32_to_v2f64:
-; CHECK: sshll.2d [[VAL64:v[0-9]+]], v0, #0
-; CHECK-NEXT: scvtf.2d v0, [[VAL64]]
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sshll.2d v0, v0, #0
+; CHECK-NEXT:    scvtf.2d v0, v0
+; CHECK-NEXT:    ret
   %conv = sitofp <2 x i32> %v to <2 x double>
   ret <2 x double> %conv
 }
 
 define <2 x double> @test_unsigned_v2i32_to_v2f64(<2 x i32> %v) nounwind readnone {
-; CHECK-LABEL: test_unsigned_v2i32_to_v2f64
-; CHECK: ushll.2d [[VAL64:v[0-9]+]], v0, #0
-; CHECK-NEXT: ucvtf.2d v0, [[VAL64]]
-; CHECK-NEXT: ret
+; CHECK-LABEL: test_unsigned_v2i32_to_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll.2d v0, v0, #0
+; CHECK-NEXT:    ucvtf.2d v0, v0
+; CHECK-NEXT:    ret
   %conv = uitofp <2 x i32> %v to <2 x double>
   ret <2 x double> %conv
 }
 
 define <2 x double> @test_signed_v2i16_to_v2f64(<2 x i16> %v) nounwind readnone {
 ; CHECK-LABEL: test_signed_v2i16_to_v2f64:
-; CHECK: shl.2s [[TMP:v[0-9]+]], v0, #16
-; CHECK: sshr.2s [[VAL32:v[0-9]+]], [[TMP]], #16
-; CHECK: sshll.2d [[VAL64:v[0-9]+]], [[VAL32]], #0
-; CHECK: scvtf.2d v0, [[VAL64]]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl.2s v0, v0, #16
+; CHECK-NEXT:    sshr.2s v0, v0, #16
+; CHECK-NEXT:    sshll.2d v0, v0, #0
+; CHECK-NEXT:    scvtf.2d v0, v0
+; CHECK-NEXT:    ret
 
   %conv = sitofp <2 x i16> %v to <2 x double>
   ret <2 x double> %conv
 }
 define <2 x double> @test_unsigned_v2i16_to_v2f64(<2 x i16> %v) nounwind readnone {
-; CHECK-LABEL: test_unsigned_v2i16_to_v2f64
-; CHECK: movi d[[MASK:[0-9]+]], #0x00ffff0000ffff
-; CHECK: and.8b [[VAL32:v[0-9]+]], v0, v[[MASK]]
-; CHECK: ushll.2d [[VAL64:v[0-9]+]], [[VAL32]], #0
-; CHECK: ucvtf.2d v0, [[VAL64]]
+; CHECK-LABEL: test_unsigned_v2i16_to_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-NEXT:    and.8b v0, v0, v1
+; CHECK-NEXT:    ushll.2d v0, v0, #0
+; CHECK-NEXT:    ucvtf.2d v0, v0
+; CHECK-NEXT:    ret
 
   %conv = uitofp <2 x i16> %v to <2 x double>
   ret <2 x double> %conv
@@ -51,20 +65,24 @@ define <2 x double> @test_unsigned_v2i16_to_v2f64(<2 x i16> %v) nounwind readnon
 
 define <2 x double> @test_signed_v2i8_to_v2f64(<2 x i8> %v) nounwind readnone {
 ; CHECK-LABEL: test_signed_v2i8_to_v2f64:
-; CHECK: shl.2s [[TMP:v[0-9]+]], v0, #24
-; CHECK: sshr.2s [[VAL32:v[0-9]+]], [[TMP]], #24
-; CHECK: sshll.2d [[VAL64:v[0-9]+]], [[VAL32]], #0
-; CHECK: scvtf.2d v0, [[VAL64]]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl.2s v0, v0, #24
+; CHECK-NEXT:    sshr.2s v0, v0, #24
+; CHECK-NEXT:    sshll.2d v0, v0, #0
+; CHECK-NEXT:    scvtf.2d v0, v0
+; CHECK-NEXT:    ret
 
   %conv = sitofp <2 x i8> %v to <2 x double>
   ret <2 x double> %conv
 }
 define <2 x double> @test_unsigned_v2i8_to_v2f64(<2 x i8> %v) nounwind readnone {
-; CHECK-LABEL: test_unsigned_v2i8_to_v2f64
-; CHECK: movi d[[MASK:[0-9]+]], #0x0000ff000000ff
-; CHECK: and.8b [[VAL32:v[0-9]+]], v0, v[[MASK]]
-; CHECK: ushll.2d [[VAL64:v[0-9]+]], [[VAL32]], #0
-; CHECK: ucvtf.2d v0, [[VAL64]]
+; CHECK-LABEL: test_unsigned_v2i8_to_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-NEXT:    and.8b v0, v0, v1
+; CHECK-NEXT:    ushll.2d v0, v0, #0
+; CHECK-NEXT:    ucvtf.2d v0, v0
+; CHECK-NEXT:    ret
 
   %conv = uitofp <2 x i8> %v to <2 x double>
   ret <2 x double> %conv
@@ -72,16 +90,39 @@ define <2 x double> @test_unsigned_v2i8_to_v2f64(<2 x i8> %v) nounwind readnone
 
 define <2 x float> @test_signed_v2i64_to_v2f32(<2 x i64> %v) nounwind readnone {
 ; CHECK-LABEL: test_signed_v2i64_to_v2f32:
-; CHECK: scvtf.2d [[VAL64:v[0-9]+]], v0
-; CHECK: fcvtn v0.2s, [[VAL64]].2d
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov.d x8, v0[1]
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    scvtf s0, x9
+; CHECK-NEXT:    scvtf s1, x8
+; CHECK-NEXT:    mov.s v0[1], v1[0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
 
   %conv = sitofp <2 x i64> %v to <2 x float>
   ret <2 x float> %conv
 }
 define <2 x float> @test_unsigned_v2i64_to_v2f32(<2 x i64> %v) nounwind readnone {
-; CHECK-LABEL: test_unsigned_v2i64_to_v2f32
-; CHECK: ucvtf.2d [[VAL64:v[0-9]+]], v0
-; CHECK: fcvtn v0.2s, [[VAL64]].2d
+; CHECK-LABEL: test_unsigned_v2i64_to_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi.2d v1, #0x000000ffffffff
+; CHECK-NEXT:    ushr.2d v2, v0, #32
+; CHECK-NEXT:    mov.d x8, v2[1]
+; CHECK-NEXT:    fmov x9, d2
+; CHECK-NEXT:    and.16b v0, v0, v1
+; CHECK-NEXT:    scvtf s2, x9
+; CHECK-NEXT:    mov w9, #1333788672 // =0x4f800000
+; CHECK-NEXT:    scvtf s1, x8
+; CHECK-NEXT:    mov.d x8, v0[1]
+; CHECK-NEXT:    dup.2s v3, w9
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    scvtf s0, x8
+; CHECK-NEXT:    mov.s v2[1], v1[0]
+; CHECK-NEXT:    scvtf s1, x9
+; CHECK-NEXT:    fmul.2s v2, v2, v3
+; CHECK-NEXT:    mov.s v1[1], v0[0]
+; CHECK-NEXT:    fadd.2s v0, v2, v1
+; CHECK-NEXT:    ret
 
   %conv = uitofp <2 x i64> %v to <2 x float>
   ret <2 x float> %conv
@@ -89,18 +130,22 @@ define <2 x float> @test_unsigned_v2i64_to_v2f32(<2 x i64> %v) nounwind readnone
 
 define <2 x float> @test_signed_v2i16_to_v2f32(<2 x i16> %v) nounwind readnone {
 ; CHECK-LABEL: test_signed_v2i16_to_v2f32:
-; CHECK: shl.2s [[TMP:v[0-9]+]], v0, #16
-; CHECK: sshr.2s [[VAL32:v[0-9]+]], [[TMP]], #16
-; CHECK: scvtf.2s v0, [[VAL32]]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl.2s v0, v0, #16
+; CHECK-NEXT:    sshr.2s v0, v0, #16
+; CHECK-NEXT:    scvtf.2s v0, v0
+; CHECK-NEXT:    ret
 
   %conv = sitofp <2 x i16> %v to <2 x float>
   ret <2 x float> %conv
 }
 define <2 x float> @test_unsigned_v2i16_to_v2f32(<2 x i16> %v) nounwind readnone {
-; CHECK-LABEL: test_unsigned_v2i16_to_v2f32
-; CHECK: movi d[[MASK:[0-9]+]], #0x00ffff0000ffff
-; CHECK: and.8b [[VAL32:v[0-9]+]], v0, v[[MASK]]
-; CHECK: ucvtf.2s v0, [[VAL32]]
+; CHECK-LABEL: test_unsigned_v2i16_to_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-NEXT:    and.8b v0, v0, v1
+; CHECK-NEXT:    ucvtf.2s v0, v0
+; CHECK-NEXT:    ret
 
   %conv = uitofp <2 x i16> %v to <2 x float>
   ret <2 x float> %conv
@@ -108,18 +153,22 @@ define <2 x float> @test_unsigned_v2i16_to_v2f32(<2 x i16> %v) nounwind readnone
 
 define <2 x float> @test_signed_v2i8_to_v2f32(<2 x i8> %v) nounwind readnone {
 ; CHECK-LABEL: test_signed_v2i8_to_v2f32:
-; CHECK: shl.2s [[TMP:v[0-9]+]], v0, #24
-; CHECK: sshr.2s [[VAL32:v[0-9]+]], [[TMP]], #24
-; CHECK: scvtf.2s v0, [[VAL32]]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl.2s v0, v0, #24
+; CHECK-NEXT:    sshr.2s v0, v0, #24
+; CHECK-NEXT:    scvtf.2s v0, v0
+; CHECK-NEXT:    ret
 
   %conv = sitofp <2 x i8> %v to <2 x float>
   ret <2 x float> %conv
 }
 define <2 x float> @test_unsigned_v2i8_to_v2f32(<2 x i8> %v) nounwind readnone {
-; CHECK-LABEL: test_unsigned_v2i8_to_v2f32
-; CHECK: movi d[[MASK:[0-9]+]], #0x0000ff000000ff
-; CHECK: and.8b [[VAL32:v[0-9]+]], v0, v[[MASK]]
-; CHECK: ucvtf.2s v0, [[VAL32]]
+; CHECK-LABEL: test_unsigned_v2i8_to_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-NEXT:    and.8b v0, v0, v1
+; CHECK-NEXT:    ucvtf.2s v0, v0
+; CHECK-NEXT:    ret
 
   %conv = uitofp <2 x i8> %v to <2 x float>
   ret <2 x float> %conv
@@ -127,17 +176,21 @@ define <2 x float> @test_unsigned_v2i8_to_v2f32(<2 x i8> %v) nounwind readnone {
 
 define <4 x float> @test_signed_v4i16_to_v4f32(<4 x i16> %v) nounwind readnone {
 ; CHECK-LABEL: test_signed_v4i16_to_v4f32:
-; CHECK: sshll.4s [[VAL32:v[0-9]+]], v0, #0
-; CHECK: scvtf.4s v0, [[VAL32]]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sshll.4s v0, v0, #0
+; CHECK-NEXT:    scvtf.4s v0, v0
+; CHECK-NEXT:    ret
 
   %conv = sitofp <4 x i16> %v to <4 x float>
   ret <4 x float> %conv
 }
 
 define <4 x float> @test_unsigned_v4i16_to_v4f32(<4 x i16> %v) nounwind readnone {
-; CHECK-LABEL: test_unsigned_v4i16_to_v4f32
-; CHECK: ushll.4s [[VAL32:v[0-9]+]], v0, #0
-; CHECK: ucvtf.4s v0, [[VAL32]]
+; CHECK-LABEL: test_unsigned_v4i16_to_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll.4s v0, v0, #0
+; CHECK-NEXT:    ucvtf.4s v0, v0
+; CHECK-NEXT:    ret
 
   %conv = uitofp <4 x i16> %v to <4 x float>
   ret <4 x float> %conv
@@ -145,19 +198,23 @@ define <4 x float> @test_unsigned_v4i16_to_v4f32(<4 x i16> %v) nounwind readnone
 
 define <4 x float> @test_signed_v4i8_to_v4f32(<4 x i8> %v) nounwind readnone {
 ; CHECK-LABEL: test_signed_v4i8_to_v4f32:
-; CHECK: shl.4h [[TMP:v[0-9]+]], v0, #8
-; CHECK: sshr.4h [[VAL16:v[0-9]+]], [[TMP]], #8
-; CHECK: sshll.4s [[VAL32:v[0-9]+]], [[VAL16]], #0
-; CHECK: scvtf.4s v0, [[VAL32]]
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl.4h v0, v0, #8
+; CHECK-NEXT:    sshr.4h v0, v0, #8
+; CHECK-NEXT:    sshll.4s v0, v0, #0
+; CHECK-NEXT:    scvtf.4s v0, v0
+; CHECK-NEXT:    ret
 
   %conv = sitofp <4 x i8> %v to <4 x float>
   ret <4 x float> %conv
 }
 define <4 x float> @test_unsigned_v4i8_to_v4f32(<4 x i8> %v) nounwind readnone {
-; CHECK-LABEL: test_unsigned_v4i8_to_v4f32
-; CHECK: bic.4h v0, #255, lsl #8
-; CHECK: ushll.4s [[VAL32:v[0-9]+]], v0, #0
-; CHECK: ucvtf.4s v0, [[VAL32]]
+; CHECK-LABEL: test_unsigned_v4i8_to_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bic.4h v0, #255, lsl #8
+; CHECK-NEXT:    ushll.4s v0, v0, #0
+; CHECK-NEXT:    ucvtf.4s v0, v0
+; CHECK-NEXT:    ret
 
   %conv = uitofp <4 x i8> %v to <4 x float>
   ret <4 x float> %conv
diff --git a/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll
index b40c0656a60e4..cef84c370b290 100644
--- a/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -260,11 +260,26 @@ define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nou
 ; CHECK-NEON-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
 ; CHECK-NEON:       // %bb.0:
 ; CHECK-NEON-NEXT:    mov w8, #2 // =0x2
+; CHECK-NEON-NEXT:    movi v2.2d, #0x000000ffffffff
 ; CHECK-NEON-NEXT:    dup v1.2d, x8
 ; CHECK-NEON-NEXT:    ushl v0.2d, v1.2d, v0.2d
+; CHECK-NEON-NEXT:    ushr v1.2d, v0.2d, #32
+; CHECK-NEON-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT:    mov x8, v1.d[1]
+; CHECK-NEON-NEXT:    fmov x9, d1
+; CHECK-NEON-NEXT:    scvtf s2, x9
+; CHECK-NEON-NEXT:    mov w9, #1333788672 // =0x4f800000
+; CHECK-NEON-NEXT:    scvtf s1, x8
+; CHECK-NEON-NEXT:    mov x8, v0.d[1]
+; CHECK-NEON-NEXT:    dup v3.2s, w9
+; CHECK-NEON-NEXT:    fmov x9, d0
+; CHECK-NEON-NEXT:    scvtf s0, x8
+; CHECK-NEON-NEXT:    mov v2.s[1], v1.s[0]
+; CHECK-NEON-NEXT:    scvtf s1, x9
+; CHECK-NEON-NEXT:    fmul v2.2s, v2.2s, v3.2s
+; CHECK-NEON-NEXT:    mov v1.s[1], v0.s[0]
+; CHECK-NEON-NEXT:    fadd v0.2s, v2.2s, v1.2s
 ; CHECK-NEON-NEXT:    fmov v1.2s, #15.00000000
-; CHECK-NEON-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEON-NEXT:    fcvtn v0.2s, v0.2d
 ; CHECK-NEON-NEXT:    fmul v0.2s, v0.2s, v1.2s
 ; CHECK-NEON-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll b/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll
index 83e60c1089762..1364c47adff2d 100644
--- a/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll
@@ -193,10 +193,17 @@ define <4 x float> @uitofp_v4f32_v4i32(<4 x i32> %x) #0 {
 define <4 x float> @sitofp_v4f32_v4i64(<4 x i64> %x) #0 {
 ; CHECK-LABEL: sitofp_v4f32_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    scvtf v1.2d, v1.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    scvtf s0, x9
+; CHECK-NEXT:    mov x9, v1.d[1]
+; CHECK-NEXT:    scvtf s2, x8
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    scvtf s1, x8
+; CHECK-NEXT:    mov v0.s[1], v2.s[0]
+; CHECK-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-NEXT:    scvtf s1, x9
+; CHECK-NEXT:    mov v0.s[3], v1.s[0]
 ; CHECK-NEXT:    ret
   %val = call <4 x float> @llvm.experimental.constrained.sitofp.v4f32.v4i64(<4 x i64> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
   ret <4 x float> %val
@@ -205,10 +212,38 @@ define <4 x float> @sitofp_v4f32_v4i64(<4 x i64> %x) #0 {
 define <4 x float> @uitofp_v4f32_v4i64(<4 x i64> %x) #0 {
 ; CHECK-LABEL: uitofp_v4f32_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    ucvtf v1.2d, v1.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-NEXT:    movi v2.2d, #0x000000ffffffff
+; CHECK-NEXT:    ushr v3.2d, v1.2d, #32
+; CHECK-NEXT:    ushr v4.2d, v0.2d, #32
+; CHECK-NEXT:    mov x8, v3.d[1]
+; CHECK-NEXT:    mov x9, v4.d[1]
+; CHECK-NEXT:    fmov x10, d3
+; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    fmov x11, d4
+; CHECK-NEXT:    scvtf s2, x10
+; CHECK-NEXT:    mov x10, v1.d[1]
+; CHECK-NEXT:    scvtf s3, x8
+; CHECK-NEXT:    scvtf s4, x11
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    scvtf s5, x9
+; CHECK-NEXT:    mov w9, #1333788672 // =0x4f800000
+; CHECK-NEXT:    fmov x11, d1
+; CHECK-NEXT:    dup v1.2s, w9
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    scvtf s0, x10
+; CHECK-NEXT:    mov v2.s[1], v3.s[0]
+; CHECK-NEXT:    scvtf s6, x11
+; CHECK-NEXT:    scvtf s3, x8
+; CHECK-NEXT:    mov v4.s[1], v5.s[0]
+; CHECK-NEXT:    scvtf s5, x9
+; CHECK-NEXT:    mov v6.s[1], v0.s[0]
+; CHECK-NEXT:    fmul v0.2s, v2.2s, v1.2s
+; CHECK-NEXT:    fmul v1.2s, v4.2s, v1.2s
+; CHECK-NEXT:    mov v5.s[1], v3.s[0]
+; CHECK-NEXT:    fadd v2.2s, v0.2s, v6.2s
+; CHECK-NEXT:    fadd v0.2s, v1.2s, v5.2s
+; CHECK-NEXT:    mov v0.d[1], v2.d[0]
 ; CHECK-NEXT:    ret
   %val = call <4 x float> @llvm.experimental.constrained.uitofp.v4f32.v4i64(<4 x i64> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
   ret <4 x float> %val
diff --git a/llvm/test/CodeGen/AArch64/fprcvt-cvtf.ll b/llvm/test/CodeGen/AArch64/fprcvt-cvtf.ll
index 0a7319b9ce11e..27499b8940ff2 100644
--- a/llvm/test/CodeGen/AArch64/fprcvt-cvtf.ll
+++ b/llvm/test/CodeGen/AArch64/fprcvt-cvtf.ll
@@ -210,15 +210,20 @@ define <1 x float> @scvtf_f32i64_simple(<1 x i64> %x) {
 ; CHECK-LABEL: scvtf_f32i64_simple:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    movi d1, #0000000000000000
+; CHECK-NEXT:    scvtf s0, d0
+; CHECK-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-NO-FPRCVT-LABEL: scvtf_f32i64_simple:
 ; CHECK-NO-FPRCVT:       // %bb.0:
 ; CHECK-NO-FPRCVT-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NO-FPRCVT-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NO-FPRCVT-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NO-FPRCVT-NEXT:    fmov x8, d0
+; CHECK-NO-FPRCVT-NEXT:    movi d1, #0000000000000000
+; CHECK-NO-FPRCVT-NEXT:    scvtf s0, x8
+; CHECK-NO-FPRCVT-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-NO-FPRCVT-NEXT:    fmov d0, d1
 ; CHECK-NO-FPRCVT-NEXT:    ret
  %conv = sitofp <1 x i64> %x to <1 x float>
  ret <1 x float> %conv
@@ -426,15 +431,43 @@ define <1 x float> @ucvtf_f32i64_simple(<1 x i64> %x) {
 ; CHECK-LABEL: ucvtf_f32i64_simple:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
+; CHECK-NEXT:    ushr v2.2d, v0.2d, #32
+; CHECK-NEXT:    mov w9, #1333788672 // =0x4f800000
+; CHECK-NEXT:    dup v3.2s, w9
+; CHECK-NEXT:    mov x8, v2.d[1]
+; CHECK-NEXT:    scvtf s2, d2
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    scvtf s1, x8
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    scvtf s0, d0
+; CHECK-NEXT:    mov v2.s[1], v1.s[0]
+; CHECK-NEXT:    scvtf s1, x8
+; CHECK-NEXT:    fmul v2.2s, v2.2s, v3.2s
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    fadd v0.2s, v2.2s, v0.2s
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-NO-FPRCVT-LABEL: ucvtf_f32i64_simple:
 ; CHECK-NO-FPRCVT:       // %bb.0:
 ; CHECK-NO-FPRCVT-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NO-FPRCVT-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NO-FPRCVT-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NO-FPRCVT-NEXT:    movi v1.2d, #0x000000ffffffff
+; CHECK-NO-FPRCVT-NEXT:    ushr v2.2d, v0.2d, #32
+; CHECK-NO-FPRCVT-NEXT:    mov x8, v2.d[1]
+; CHECK-NO-FPRCVT-NEXT:    fmov x9, d2
+; CHECK-NO-FPRCVT-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NO-FPRCVT-NEXT:    scvtf s2, x9
+; CHECK-NO-FPRCVT-NEXT:    mov w9, #1333788672 // =0x4f800000
+; CHECK-NO-FPRCVT-NEXT:    scvtf s1, x8
+; CHECK-NO-FPRCVT-NEXT:    mov x8, v0.d[1]
+; CHECK-NO-FPRCVT-NEXT:    dup v3.2s, w9
+; CHECK-NO-FPRCVT-NEXT:    fmov x9, d0
+; CHECK-NO-FPRCVT-NEXT:    scvtf s0, x8
+; CHECK-NO-FPRCVT-NEXT:    mov v2.s[1], v1.s[0]
+; CHECK-NO-FPRCVT-NEXT:    scvtf s1, x9
+; CHECK-NO-FPRCVT-NEXT:    fmul v2.2s, v2.2s, v3.2s
+; CHECK-NO-FPRCVT-NEXT:    mov v1.s[1], v0.s[0]
+; CHECK-NO-FPRCVT-NEXT:    fadd v0.2s, v2.2s, v1.2s
 ; CHECK-NO-FPRCVT-NEXT:    ret
  %conv = uitofp <1 x i64> %x to <1 x float>
  ret <1 x float> %conv
diff --git a/llvm/test/CodeGen/AArch64/itofp-bf16.ll b/llvm/test/CodeGen/AArch64/itofp-bf16.ll
index 58591b11c184f..9754a95dbcc16 100644
--- a/llvm/test/CodeGen/AArch64/itofp-bf16.ll
+++ b/llvm/test/CodeGen/AArch64/itofp-bf16.ll
@@ -349,22 +349,27 @@ define <3 x bfloat> @stofp_v3i64_v3bf16(<3 x i64> %a) {
 ; CHECK-LABEL: stofp_v3i64_v3bf16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-NEXT:    scvtf v1.2d, v2.2d
-; CHECK-NEXT:    movi v2.4s, #127, msl #8
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT:    movi v1.4s, #1
-; CHECK-NEXT:    ushr v3.4s, v0.4s, #16
-; CHECK-NEXT:    add v2.4s, v0.4s, v2.4s
-; CHECK-NEXT:    and v1.16b, v3.16b, v1.16b
-; CHECK-NEXT:    fcmeq v3.4s, v0.4s, v0.4s
-; CHECK-NEXT:    orr v0.4s, #64, lsl #16
-; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    bit v0.16b, v1.16b, v3.16b
+; CHECK-NEXT:    scvtf s0, x8
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    scvtf s1, x8
+; CHECK-NEXT:    fmov x8, d2
+; CHECK-NEXT:    mov v3.s[0], v0.s[0]
+; CHECK-NEXT:    scvtf s0, x8
+; CHECK-NEXT:    mov v3.s[1], v1.s[0]
+; CHECK-NEXT:    movi v1.4s, #127, msl #8
+; CHECK-NEXT:    mov v3.s[2], v0.s[0]
+; CHECK-NEXT:    movi v0.4s, #1
+; CHECK-NEXT:    ushr v2.4s, v3.4s, #16
+; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
+; CHECK-NEXT:    and v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    fcmeq v2.4s, v3.4s, v3.4s
+; CHECK-NEXT:    orr v3.4s, #64, lsl #16
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    bif v0.16b, v3.16b, v2.16b
 ; CHECK-NEXT:    shrn v0.4h, v0.4s, #16
 ; CHECK-NEXT:    ret
 entry:
@@ -378,13 +383,40 @@ define <3 x bfloat> @utofp_v3i64_v3bf16(<3 x i64> %a) {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    ushr v5.2d, v2.2d, #32
 ; CHECK-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-NEXT:    ucvtf v1.2d, v2.2d
-; CHECK-NEXT:    movi v2.4s, #127, msl #8
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
+; CHECK-NEXT:    ushr v3.2d, v0.2d, #32
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    and v1.16b, v2.16b, v1.16b
+; CHECK-NEXT:    mov x8, v3.d[1]
+; CHECK-NEXT:    fmov x10, d3
+; CHECK-NEXT:    mov x9, v0.d[1]
+; CHECK-NEXT:    scvtf s4, x10
+; CHECK-NEXT:    scvtf s3, x8
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    scvtf s0, x9
+; CHECK-NEXT:    mov x9, v5.d[1]
+; CHECK-NEXT:    scvtf s2, x8
+; CHECK-NEXT:    fmov x8, d5
+; CHECK-NEXT:    mov v4.s[1], v3.s[0]
+; CHECK-NEXT:    scvtf s3, x8
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    mov v2.s[1], v0.s[0]
+; CHECK-NEXT:    scvtf s0, x8
+; CHECK-NEXT:    mov x8, v1.d[1]
+; CHECK-NEXT:    scvtf s1, x9
+; CHECK-NEXT:    mov v4.s[2], v3.s[0]
+; CHECK-NEXT:    mov v2.s[2], v0.s[0]
+; CHECK-NEXT:    scvtf s0, x8
+; CHECK-NEXT:    mov w8, #1333788672 // =0x4f800000
+; CHECK-NEXT:    mov v4.s[3], v1.s[0]
+; CHECK-NEXT:    dup v1.4s, w8
+; CHECK-NEXT:    mov v2.s[3], v0.s[0]
+; CHECK-NEXT:    fmul v0.4s, v4.4s, v1.4s
 ; CHECK-NEXT:    movi v1.4s, #1
+; CHECK-NEXT:    fadd v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    movi v2.4s, #127, msl #8
 ; CHECK-NEXT:    ushr v3.4s, v0.4s, #16
 ; CHECK-NEXT:    add v2.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    and v1.16b, v3.16b, v1.16b
@@ -402,19 +434,26 @@ entry:
 define <4 x bfloat> @stofp_v4i64_v4bf16(<4 x i64> %a) {
 ; CHECK-LABEL: stofp_v4i64_v4bf16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    scvtf v1.2d, v1.2d
-; CHECK-NEXT:    movi v2.4s, #127, msl #8
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT:    movi v1.4s, #1
-; CHECK-NEXT:    ushr v3.4s, v0.4s, #16
-; CHECK-NEXT:    add v2.4s, v0.4s, v2.4s
-; CHECK-NEXT:    and v1.16b, v3.16b, v1.16b
-; CHECK-NEXT:    fcmeq v3.4s, v0.4s, v0.4s
-; CHECK-NEXT:    orr v0.4s, #64, lsl #16
-; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    bit v0.16b, v1.16b, v3.16b
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    scvtf s2, x9
+; CHECK-NEXT:    mov x9, v1.d[1]
+; CHECK-NEXT:    scvtf s0, x8
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    scvtf s1, x8
+; CHECK-NEXT:    mov v2.s[1], v0.s[0]
+; CHECK-NEXT:    scvtf s0, x9
+; CHECK-NEXT:    mov v2.s[2], v1.s[0]
+; CHECK-NEXT:    movi v1.4s, #127, msl #8
+; CHECK-NEXT:    mov v2.s[3], v0.s[0]
+; CHECK-NEXT:    movi v0.4s, #1
+; CHECK-NEXT:    ushr v3.4s, v2.4s, #16
+; CHECK-NEXT:    add v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    and v0.16b, v3.16b, v0.16b
+; CHECK-NEXT:    fcmeq v3.4s, v2.4s, v2.4s
+; CHECK-NEXT:    orr v2.4s, #64, lsl #16
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    bif v0.16b, v2.16b, v3.16b
 ; CHECK-NEXT:    shrn v0.4h, v0.4s, #16
 ; CHECK-NEXT:    ret
 entry:
@@ -425,12 +464,39 @@ entry:
 define <4 x bfloat> @utofp_v4i64_v4bf16(<4 x i64> %a) {
 ; CHECK-LABEL: utofp_v4i64_v4bf16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    ucvtf v1.2d, v1.2d
-; CHECK-NEXT:    movi v2.4s, #127, msl #8
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-NEXT:    movi v2.2d, #0x000000ffffffff
+; CHECK-NEXT:    ushr v3.2d, v0.2d, #32
+; CHECK-NEXT:    ushr v4.2d, v1.2d, #32
+; CHECK-NEXT:    mov x8, v3.d[1]
+; CHECK-NEXT:    fmov x10, d3
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    scvtf s3, x10
+; CHECK-NEXT:    scvtf s5, x8
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    mov x9, v0.d[1]
+; CHECK-NEXT:    scvtf s2, x8
+; CHECK-NEXT:    fmov x8, d4
+; CHECK-NEXT:    scvtf s0, x9
+; CHECK-NEXT:    mov x9, v4.d[1]
+; CHECK-NEXT:    mov v3.s[1], v5.s[0]
+; CHECK-NEXT:    scvtf s4, x8
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    mov v2.s[1], v0.s[0]
+; CHECK-NEXT:    scvtf s0, x8
+; CHECK-NEXT:    mov x8, v1.d[1]
+; CHECK-NEXT:    scvtf s1, x9
+; CHECK-NEXT:    mov v3.s[2], v4.s[0]
+; CHECK-NEXT:    mov v2.s[2], v0.s[0]
+; CHECK-NEXT:    scvtf s0, x8
+; CHECK-NEXT:    mov w8, #1333788672 // =0x4f800000
+; CHECK-NEXT:    mov v3.s[3], v1.s[0]
+; CHECK-NEXT:    dup v1.4s, w8
+; CHECK-NEXT:    mov v2.s[3], v0.s[0]
+; CHECK-NEXT:    fmul v0.4s, v3.4s, v1.4s
 ; CHECK-NEXT:    movi v1.4s, #1
+; CHECK-NEXT:    fadd v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    movi v2.4s, #127, msl #8
 ; CHECK-NEXT:    ushr v3.4s, v0.4s, #16
 ; CHECK-NEXT:    add v2.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    and v1.16b, v3.16b, v1.16b
@@ -448,31 +514,46 @@ entry:
 define <8 x bfloat> @stofp_v8i64_v8bf16(<8 x i64> %a) {
 ; CHECK-LABEL: stofp_v8i64_v8bf16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    scvtf v2.2d, v2.2d
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    scvtf v3.2d, v3.2d
-; CHECK-NEXT:    scvtf v1.2d, v1.2d
-; CHECK-NEXT:    fcvtn v2.2s, v2.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn2 v2.4s, v3.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT:    movi v1.4s, #1
-; CHECK-NEXT:    movi v3.4s, #127, msl #8
-; CHECK-NEXT:    ushr v4.4s, v2.4s, #16
-; CHECK-NEXT:    ushr v5.4s, v0.4s, #16
-; CHECK-NEXT:    add v6.4s, v2.4s, v3.4s
-; CHECK-NEXT:    add v3.4s, v0.4s, v3.4s
-; CHECK-NEXT:    and v4.16b, v4.16b, v1.16b
-; CHECK-NEXT:    and v1.16b, v5.16b, v1.16b
+; CHECK-NEXT:    fmov x10, d2
+; CHECK-NEXT:    mov x8, v2.d[1]
+; CHECK-NEXT:    mov x9, v0.d[1]
+; CHECK-NEXT:    scvtf s2, x10
+; CHECK-NEXT:    fmov x10, d0
+; CHECK-NEXT:    scvtf s0, x8
+; CHECK-NEXT:    scvtf s5, x9
+; CHECK-NEXT:    fmov x9, d3
+; CHECK-NEXT:    mov x8, v3.d[1]
+; CHECK-NEXT:    scvtf s4, x10
+; CHECK-NEXT:    fmov x10, d1
+; CHECK-NEXT:    scvtf s3, x9
+; CHECK-NEXT:    mov x9, v1.d[1]
+; CHECK-NEXT:    mov v2.s[1], v0.s[0]
+; CHECK-NEXT:    scvtf s0, x10
+; CHECK-NEXT:    scvtf s1, x8
+; CHECK-NEXT:    mov v4.s[1], v5.s[0]
+; CHECK-NEXT:    mov v2.s[2], v3.s[0]
+; CHECK-NEXT:    scvtf s3, x9
+; CHECK-NEXT:    mov v4.s[2], v0.s[0]
+; CHECK-NEXT:    movi v0.4s, #1
+; CHECK-NEXT:    mov v2.s[3], v1.s[0]
+; CHECK-NEXT:    movi v1.4s, #127, msl #8
+; CHECK-NEXT:    mov v4.s[3], v3.s[0]
+; CHECK-NEXT:    ushr v3.4s, v2.4s, #16
+; CHECK-NEXT:    add v6.4s, v2.4s, v1.4s
+; CHECK-NEXT:    ushr v5.4s, v4.4s, #16
+; CHECK-NEXT:    add v1.4s, v4.4s, v1.4s
+; CHECK-NEXT:    and v3.16b, v3.16b, v0.16b
+; CHECK-NEXT:    and v0.16b, v5.16b, v0.16b
 ; CHECK-NEXT:    fcmeq v5.4s, v2.4s, v2.4s
 ; CHECK-NEXT:    orr v2.4s, #64, lsl #16
-; CHECK-NEXT:    add v4.4s, v4.4s, v6.4s
-; CHECK-NEXT:    fcmeq v6.4s, v0.4s, v0.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    orr v0.4s, #64, lsl #16
-; CHECK-NEXT:    bit v2.16b, v4.16b, v5.16b
-; CHECK-NEXT:    bit v0.16b, v1.16b, v6.16b
-; CHECK-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    add v3.4s, v3.4s, v6.4s
+; CHECK-NEXT:    fcmeq v6.4s, v4.4s, v4.4s
+; CHECK-NEXT:    orr v4.4s, #64, lsl #16
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mov v1.16b, v5.16b
+; CHECK-NEXT:    bif v0.16b, v4.16b, v6.16b
+; CHECK-NEXT:    bsl v1.16b, v3.16b, v2.16b
+; CHECK-NEXT:    uzp2 v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
 entry:
   %c = sitofp <8 x i64> %a to <8 x bfloat>
@@ -482,31 +563,82 @@ entry:
 define <8 x bfloat> @utofp_v8i64_v8bf16(<8 x i64> %a) {
 ; CHECK-LABEL: utofp_v8i64_v8bf16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ucvtf v2.2d, v2.2d
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    ucvtf v3.2d, v3.2d
-; CHECK-NEXT:    ucvtf v1.2d, v1.2d
-; CHECK-NEXT:    fcvtn v2.2s, v2.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn2 v2.4s, v3.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT:    movi v1.4s, #1
+; CHECK-NEXT:    ushr v5.2d, v2.2d, #32
+; CHECK-NEXT:    movi v4.2d, #0x000000ffffffff
+; CHECK-NEXT:    ushr v6.2d, v0.2d, #32
+; CHECK-NEXT:    ushr v7.2d, v3.2d, #32
+; CHECK-NEXT:    ushr v16.2d, v1.2d, #32
+; CHECK-NEXT:    fmov x10, d5
+; CHECK-NEXT:    mov x8, v5.d[1]
+; CHECK-NEXT:    mov x9, v6.d[1]
+; CHECK-NEXT:    and v2.16b, v2.16b, v4.16b
+; CHECK-NEXT:    and v3.16b, v3.16b, v4.16b
+; CHECK-NEXT:    and v0.16b, v0.16b, v4.16b
+; CHECK-NEXT:    and v1.16b, v1.16b, v4.16b
+; CHECK-NEXT:    fmov x12, d7
+; CHECK-NEXT:    mov x11, v7.d[1]
+; CHECK-NEXT:    scvtf s4, x10
+; CHECK-NEXT:    fmov x10, d6
+; CHECK-NEXT:    mov x13, v2.d[1]
+; CHECK-NEXT:    scvtf s5, x8
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    scvtf s7, x9
+; CHECK-NEXT:    scvtf s17, x12
+; CHECK-NEXT:    fmov x12, d16
+; CHECK-NEXT:    scvtf s6, x10
+; CHECK-NEXT:    fmov x10, d2
+; CHECK-NEXT:    mov x9, v16.d[1]
+; CHECK-NEXT:    scvtf s16, x13
+; CHECK-NEXT:    mov v4.s[1], v5.s[0]
+; CHECK-NEXT:    fmov x13, d1
+; CHECK-NEXT:    scvtf s2, x10
+; CHECK-NEXT:    fmov x10, d0
+; CHECK-NEXT:    scvtf s0, x12
+; CHECK-NEXT:    mov v6.s[1], v7.s[0]
+; CHECK-NEXT:    scvtf s7, x8
+; CHECK-NEXT:    mov x8, v3.d[1]
+; CHECK-NEXT:    mov x12, v1.d[1]
+; CHECK-NEXT:    scvtf s5, x10
+; CHECK-NEXT:    fmov x10, d3
+; CHECK-NEXT:    scvtf s3, x11
+; CHECK-NEXT:    mov v2.s[1], v16.s[0]
+; CHECK-NEXT:    mov v4.s[2], v17.s[0]
+; CHECK-NEXT:    scvtf s16, x13
+; CHECK-NEXT:    mov v6.s[2], v0.s[0]
+; CHECK-NEXT:    scvtf s0, x9
+; CHECK-NEXT:    mov w9, #1333788672 // =0x4f800000
+; CHECK-NEXT:    scvtf s1, x10
+; CHECK-NEXT:    mov v5.s[1], v7.s[0]
+; CHECK-NEXT:    dup v7.4s, w9
+; CHECK-NEXT:    mov v4.s[3], v3.s[0]
+; CHECK-NEXT:    scvtf s3, x12
+; CHECK-NEXT:    mov v6.s[3], v0.s[0]
+; CHECK-NEXT:    mov v2.s[2], v1.s[0]
+; CHECK-NEXT:    scvtf s1, x8
+; CHECK-NEXT:    mov v5.s[2], v16.s[0]
+; CHECK-NEXT:    fmul v0.4s, v4.4s, v7.4s
+; CHECK-NEXT:    mov v2.s[3], v1.s[0]
+; CHECK-NEXT:    fmul v1.4s, v6.4s, v7.4s
+; CHECK-NEXT:    mov v5.s[3], v3.s[0]
 ; CHECK-NEXT:    movi v3.4s, #127, msl #8
-; CHECK-NEXT:    ushr v4.4s, v2.4s, #16
-; CHECK-NEXT:    ushr v5.4s, v0.4s, #16
-; CHECK-NEXT:    add v6.4s, v2.4s, v3.4s
-; CHECK-NEXT:    add v3.4s, v0.4s, v3.4s
-; CHECK-NEXT:    and v4.16b, v4.16b, v1.16b
-; CHECK-NEXT:    and v1.16b, v5.16b, v1.16b
-; CHECK-NEXT:    fcmeq v5.4s, v2.4s, v2.4s
-; CHECK-NEXT:    orr v2.4s, #64, lsl #16
-; CHECK-NEXT:    add v4.4s, v4.4s, v6.4s
-; CHECK-NEXT:    fcmeq v6.4s, v0.4s, v0.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    fadd v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    movi v2.4s, #1
+; CHECK-NEXT:    fadd v1.4s, v1.4s, v5.4s
+; CHECK-NEXT:    ushr v4.4s, v0.4s, #16
+; CHECK-NEXT:    add v6.4s, v0.4s, v3.4s
+; CHECK-NEXT:    ushr v5.4s, v1.4s, #16
+; CHECK-NEXT:    add v3.4s, v1.4s, v3.4s
+; CHECK-NEXT:    fcmeq v7.4s, v1.4s, v1.4s
+; CHECK-NEXT:    orr v1.4s, #64, lsl #16
+; CHECK-NEXT:    and v4.16b, v4.16b, v2.16b
+; CHECK-NEXT:    and v2.16b, v5.16b, v2.16b
+; CHECK-NEXT:    fcmeq v5.4s, v0.4s, v0.4s
 ; CHECK-NEXT:    orr v0.4s, #64, lsl #16
-; CHECK-NEXT:    bit v2.16b, v4.16b, v5.16b
-; CHECK-NEXT:    bit v0.16b, v1.16b, v6.16b
-; CHECK-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    add v4.4s, v4.4s, v6.4s
+; CHECK-NEXT:    add v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    bit v0.16b, v4.16b, v5.16b
+; CHECK-NEXT:    bit v1.16b, v2.16b, v7.16b
+; CHECK-NEXT:    uzp2 v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
 entry:
   %c = uitofp <8 x i64> %a to <8 x bfloat>
@@ -516,55 +648,82 @@ entry:
 define <16 x bfloat> @stofp_v16i64_v16bf16(<16 x i64> %a) {
 ; CHECK-LABEL: stofp_v16i64_v16bf16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    scvtf v2.2d, v2.2d
-; CHECK-NEXT:    scvtf v6.2d, v6.2d
-; CHECK-NEXT:    scvtf v4.2d, v4.2d
-; CHECK-NEXT:    scvtf v1.2d, v1.2d
-; CHECK-NEXT:    scvtf v3.2d, v3.2d
-; CHECK-NEXT:    scvtf v7.2d, v7.2d
-; CHECK-NEXT:    scvtf v5.2d, v5.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn v2.2s, v2.2d
-; CHECK-NEXT:    fcvtn v6.2s, v6.2d
-; CHECK-NEXT:    fcvtn v4.2s, v4.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT:    fcvtn2 v2.4s, v3.2d
-; CHECK-NEXT:    fcvtn2 v6.4s, v7.2d
-; CHECK-NEXT:    fcvtn2 v4.4s, v5.2d
+; CHECK-NEXT:    mov x8, v2.d[1]
+; CHECK-NEXT:    fmov x11, d0
+; CHECK-NEXT:    mov x9, v0.d[1]
+; CHECK-NEXT:    fmov x10, d2
+; CHECK-NEXT:    mov x12, v6.d[1]
+; CHECK-NEXT:    scvtf s2, x11
+; CHECK-NEXT:    fmov x11, d3
+; CHECK-NEXT:    scvtf s16, x8
+; CHECK-NEXT:    fmov x8, d6
+; CHECK-NEXT:    scvtf s0, x10
+; CHECK-NEXT:    mov x10, v4.d[1]
+; CHECK-NEXT:    scvtf s17, x9
+; CHECK-NEXT:    mov x9, v3.d[1]
+; CHECK-NEXT:    scvtf s6, x12
+; CHECK-NEXT:    fmov x12, d4
+; CHECK-NEXT:    scvtf s4, x11
+; CHECK-NEXT:    scvtf s3, x8
+; CHECK-NEXT:    fmov x11, d7
+; CHECK-NEXT:    mov x8, v1.d[1]
+; CHECK-NEXT:    mov v0.s[1], v16.s[0]
+; CHECK-NEXT:    scvtf s18, x10
+; CHECK-NEXT:    scvtf s19, x12
+; CHECK-NEXT:    fmov x10, d1
+; CHECK-NEXT:    mov v2.s[1], v17.s[0]
+; CHECK-NEXT:    mov x12, v5.d[1]
+; CHECK-NEXT:    mov v3.s[1], v6.s[0]
+; CHECK-NEXT:    scvtf s6, x11
+; CHECK-NEXT:    fmov x11, d5
+; CHECK-NEXT:    scvtf s1, x10
+; CHECK-NEXT:    mov x10, v7.d[1]
+; CHECK-NEXT:    scvtf s7, x9
+; CHECK-NEXT:    mov v19.s[1], v18.s[0]
+; CHECK-NEXT:    scvtf s16, x8
+; CHECK-NEXT:    mov v0.s[2], v4.s[0]
+; CHECK-NEXT:    scvtf s5, x11
+; CHECK-NEXT:    mov v3.s[2], v6.s[0]
+; CHECK-NEXT:    scvtf s4, x10
+; CHECK-NEXT:    mov v2.s[2], v1.s[0]
+; CHECK-NEXT:    scvtf s1, x12
+; CHECK-NEXT:    mov v0.s[3], v7.s[0]
+; CHECK-NEXT:    mov v19.s[2], v5.s[0]
+; CHECK-NEXT:    mov v2.s[3], v16.s[0]
+; CHECK-NEXT:    mov v3.s[3], v4.s[0]
+; CHECK-NEXT:    movi v4.4s, #127, msl #8
+; CHECK-NEXT:    ushr v5.4s, v0.4s, #16
+; CHECK-NEXT:    mov v19.s[3], v1.s[0]
 ; CHECK-NEXT:    movi v1.4s, #1
-; CHECK-NEXT:    movi v3.4s, #127, msl #8
-; CHECK-NEXT:    ushr v7.4s, v0.4s, #16
-; CHECK-NEXT:    ushr v5.4s, v2.4s, #16
-; CHECK-NEXT:    ushr v16.4s, v6.4s, #16
-; CHECK-NEXT:    ushr v17.4s, v4.4s, #16
-; CHECK-NEXT:    add v19.4s, v0.4s, v3.4s
-; CHECK-NEXT:    add v18.4s, v2.4s, v3.4s
-; CHECK-NEXT:    add v20.4s, v6.4s, v3.4s
-; CHECK-NEXT:    add v3.4s, v4.4s, v3.4s
-; CHECK-NEXT:    and v7.16b, v7.16b, v1.16b
+; CHECK-NEXT:    ushr v6.4s, v2.4s, #16
+; CHECK-NEXT:    ushr v7.4s, v3.4s, #16
+; CHECK-NEXT:    add v17.4s, v0.4s, v4.4s
+; CHECK-NEXT:    add v18.4s, v2.4s, v4.4s
+; CHECK-NEXT:    add v20.4s, v3.4s, v4.4s
+; CHECK-NEXT:    ushr v16.4s, v19.4s, #16
 ; CHECK-NEXT:    and v5.16b, v5.16b, v1.16b
-; CHECK-NEXT:    and v16.16b, v16.16b, v1.16b
-; CHECK-NEXT:    and v1.16b, v17.16b, v1.16b
+; CHECK-NEXT:    add v4.4s, v19.4s, v4.4s
+; CHECK-NEXT:    and v6.16b, v6.16b, v1.16b
+; CHECK-NEXT:    and v7.16b, v7.16b, v1.16b
+; CHECK-NEXT:    and v1.16b, v16.16b, v1.16b
+; CHECK-NEXT:    add v5.4s, v5.4s, v17.4s
+; CHECK-NEXT:    fcmeq v16.4s, v0.4s, v0.4s
+; CHECK-NEXT:    add v6.4s, v6.4s, v18.4s
 ; CHECK-NEXT:    fcmeq v17.4s, v2.4s, v2.4s
-; CHECK-NEXT:    orr v2.4s, #64, lsl #16
-; CHECK-NEXT:    add v7.4s, v7.4s, v19.4s
-; CHECK-NEXT:    fcmeq v19.4s, v6.4s, v6.4s
-; CHECK-NEXT:    add v5.4s, v5.4s, v18.4s
-; CHECK-NEXT:    fcmeq v18.4s, v0.4s, v0.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    fcmeq v3.4s, v4.4s, v4.4s
-; CHECK-NEXT:    add v16.4s, v16.4s, v20.4s
+; CHECK-NEXT:    fcmeq v18.4s, v3.4s, v3.4s
 ; CHECK-NEXT:    orr v0.4s, #64, lsl #16
-; CHECK-NEXT:    orr v6.4s, #64, lsl #16
-; CHECK-NEXT:    orr v4.4s, #64, lsl #16
-; CHECK-NEXT:    bit v2.16b, v5.16b, v17.16b
-; CHECK-NEXT:    mov v5.16b, v19.16b
-; CHECK-NEXT:    bit v0.16b, v7.16b, v18.16b
-; CHECK-NEXT:    bif v1.16b, v4.16b, v3.16b
-; CHECK-NEXT:    bsl v5.16b, v16.16b, v6.16b
-; CHECK-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
-; CHECK-NEXT:    uzp2 v1.8h, v1.8h, v5.8h
+; CHECK-NEXT:    add v7.4s, v7.4s, v20.4s
+; CHECK-NEXT:    orr v2.4s, #64, lsl #16
+; CHECK-NEXT:    add v1.4s, v1.4s, v4.4s
+; CHECK-NEXT:    fcmeq v4.4s, v19.4s, v19.4s
+; CHECK-NEXT:    orr v3.4s, #64, lsl #16
+; CHECK-NEXT:    orr v19.4s, #64, lsl #16
+; CHECK-NEXT:    bit v0.16b, v5.16b, v16.16b
+; CHECK-NEXT:    bit v2.16b, v6.16b, v17.16b
+; CHECK-NEXT:    bit v3.16b, v7.16b, v18.16b
+; CHECK-NEXT:    bif v1.16b, v19.16b, v4.16b
+; CHECK-NEXT:    uzp2 v0.8h, v2.8h, v0.8h
+; CHECK-NEXT:    uzp2 v1.8h, v1.8h, v3.8h
 ; CHECK-NEXT:    ret
 entry:
   %c = sitofp <16 x i64> %a to <16 x bfloat>
@@ -574,55 +733,154 @@ entry:
 define <16 x bfloat> @utofp_v16i64_v16bf16(<16 x i64> %a) {
 ; CHECK-LABEL: utofp_v16i64_v16bf16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    ucvtf v2.2d, v2.2d
-; CHECK-NEXT:    ucvtf v6.2d, v6.2d
-; CHECK-NEXT:    ucvtf v4.2d, v4.2d
-; CHECK-NEXT:    ucvtf v1.2d, v1.2d
-; CHECK-NEXT:    ucvtf v3.2d, v3.2d
-; CHECK-NEXT:    ucvtf v7.2d, v7.2d
-; CHECK-NEXT:    ucvtf v5.2d, v5.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn v2.2s, v2.2d
-; CHECK-NEXT:    fcvtn v6.2s, v6.2d
-; CHECK-NEXT:    fcvtn v4.2s, v4.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT:    fcvtn2 v2.4s, v3.2d
-; CHECK-NEXT:    fcvtn2 v6.4s, v7.2d
-; CHECK-NEXT:    fcvtn2 v4.4s, v5.2d
-; CHECK-NEXT:    movi v1.4s, #1
-; CHECK-NEXT:    movi v3.4s, #127, msl #8
-; CHECK-NEXT:    ushr v7.4s, v0.4s, #16
-; CHECK-NEXT:    ushr v5.4s, v2.4s, #16
-; CHECK-NEXT:    ushr v16.4s, v6.4s, #16
-; CHECK-NEXT:    ushr v17.4s, v4.4s, #16
-; CHECK-NEXT:    add v19.4s, v0.4s, v3.4s
-; CHECK-NEXT:    add v18.4s, v2.4s, v3.4s
-; CHECK-NEXT:    add v20.4s, v6.4s, v3.4s
-; CHECK-NEXT:    add v3.4s, v4.4s, v3.4s
-; CHECK-NEXT:    and v7.16b, v7.16b, v1.16b
-; CHECK-NEXT:    and v5.16b, v5.16b, v1.16b
-; CHECK-NEXT:    and v16.16b, v16.16b, v1.16b
-; CHECK-NEXT:    and v1.16b, v17.16b, v1.16b
-; CHECK-NEXT:    fcmeq v17.4s, v2.4s, v2.4s
-; CHECK-NEXT:    orr v2.4s, #64, lsl #16
-; CHECK-NEXT:    add v7.4s, v7.4s, v19.4s
-; CHECK-NEXT:    fcmeq v19.4s, v6.4s, v6.4s
-; CHECK-NEXT:    add v5.4s, v5.4s, v18.4s
-; CHECK-NEXT:    fcmeq v18.4s, v0.4s, v0.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    fcmeq v3.4s, v4.4s, v4.4s
-; CHECK-NEXT:    add v16.4s, v16.4s, v20.4s
+; CHECK-NEXT:    movi v16.2d, #0x000000ffffffff
+; CHECK-NEXT:    ushr v18.2d, v2.2d, #32
+; CHECK-NEXT:    ushr v22.2d, v0.2d, #32
+; CHECK-NEXT:    ushr v19.2d, v3.2d, #32
+; CHECK-NEXT:    ushr v21.2d, v1.2d, #32
+; CHECK-NEXT:    ushr v20.2d, v6.2d, #32
+; CHECK-NEXT:    mov x8, v18.d[1]
+; CHECK-NEXT:    fmov x9, d18
+; CHECK-NEXT:    mov x11, v22.d[1]
+; CHECK-NEXT:    and v17.16b, v2.16b, v16.16b
+; CHECK-NEXT:    and v23.16b, v0.16b, v16.16b
+; CHECK-NEXT:    and v25.16b, v3.16b, v16.16b
+; CHECK-NEXT:    and v18.16b, v1.16b, v16.16b
+; CHECK-NEXT:    and v6.16b, v6.16b, v16.16b
+; CHECK-NEXT:    scvtf s2, x9
+; CHECK-NEXT:    fmov x9, d19
+; CHECK-NEXT:    mov x10, v17.d[1]
+; CHECK-NEXT:    fmov x12, d17
+; CHECK-NEXT:    mov x13, v23.d[1]
+; CHECK-NEXT:    scvtf s24, x8
+; CHECK-NEXT:    mov x8, v19.d[1]
+; CHECK-NEXT:    fmov x14, d18
+; CHECK-NEXT:    scvtf s19, x9
+; CHECK-NEXT:    mov x9, v25.d[1]
+; CHECK-NEXT:    scvtf s0, x12
+; CHECK-NEXT:    fmov x12, d23
+; CHECK-NEXT:    scvtf s17, x10
+; CHECK-NEXT:    fmov x10, d22
+; CHECK-NEXT:    scvtf s22, x11
+; CHECK-NEXT:    scvtf s23, x13
+; CHECK-NEXT:    mov v2.s[1], v24.s[0]
+; CHECK-NEXT:    mov x11, v6.d[1]
+; CHECK-NEXT:    scvtf s1, x12
+; CHECK-NEXT:    mov x12, v20.d[1]
+; CHECK-NEXT:    mov x13, v18.d[1]
+; CHECK-NEXT:    scvtf s3, x10
+; CHECK-NEXT:    fmov x10, d25
+; CHECK-NEXT:    mov v0.s[1], v17.s[0]
+; CHECK-NEXT:    mov v2.s[2], v19.s[0]
+; CHECK-NEXT:    scvtf s19, x8
+; CHECK-NEXT:    mov x8, v21.d[1]
+; CHECK-NEXT:    scvtf s17, x10
+; CHECK-NEXT:    fmov x10, d21
+; CHECK-NEXT:    mov v1.s[1], v23.s[0]
+; CHECK-NEXT:    mov v3.s[1], v22.s[0]
+; CHECK-NEXT:    ushr v22.2d, v4.2d, #32
+; CHECK-NEXT:    scvtf s23, x9
+; CHECK-NEXT:    fmov x9, d6
+; CHECK-NEXT:    and v6.16b, v4.16b, v16.16b
+; CHECK-NEXT:    ushr v21.2d, v7.2d, #32
+; CHECK-NEXT:    mov v2.s[3], v19.s[0]
+; CHECK-NEXT:    scvtf s19, x11
+; CHECK-NEXT:    and v7.16b, v7.16b, v16.16b
+; CHECK-NEXT:    mov v0.s[2], v17.s[0]
+; CHECK-NEXT:    scvtf s17, x10
+; CHECK-NEXT:    fmov x10, d20
+; CHECK-NEXT:    scvtf s4, x9
+; CHECK-NEXT:    mov x9, v22.d[1]
+; CHECK-NEXT:    scvtf s20, x12
+; CHECK-NEXT:    mov x11, v21.d[1]
+; CHECK-NEXT:    fmov x12, d21
+; CHECK-NEXT:    scvtf s21, x10
+; CHECK-NEXT:    mov x10, v6.d[1]
+; CHECK-NEXT:    mov v3.s[2], v17.s[0]
+; CHECK-NEXT:    mov v0.s[3], v23.s[0]
+; CHECK-NEXT:    scvtf s17, x9
+; CHECK-NEXT:    fmov x9, d22
+; CHECK-NEXT:    mov v4.s[1], v19.s[0]
+; CHECK-NEXT:    scvtf s18, x12
+; CHECK-NEXT:    fmov x12, d7
+; CHECK-NEXT:    scvtf s22, x10
+; CHECK-NEXT:    mov w10, #1333788672 // =0x4f800000
+; CHECK-NEXT:    mov v21.s[1], v20.s[0]
+; CHECK-NEXT:    scvtf s19, x9
+; CHECK-NEXT:    fmov x9, d6
+; CHECK-NEXT:    ushr v6.2d, v5.2d, #32
+; CHECK-NEXT:    and v5.16b, v5.16b, v16.16b
+; CHECK-NEXT:    dup v16.4s, w10
+; CHECK-NEXT:    scvtf s20, x9
+; CHECK-NEXT:    fmov x10, d6
+; CHECK-NEXT:    mov x9, v6.d[1]
+; CHECK-NEXT:    mov v19.s[1], v17.s[0]
+; CHECK-NEXT:    mov v21.s[2], v18.s[0]
+; CHECK-NEXT:    scvtf s18, x12
+; CHECK-NEXT:    scvtf s6, x14
+; CHECK-NEXT:    fmul v2.4s, v2.4s, v16.4s
+; CHECK-NEXT:    scvtf s17, x10
+; CHECK-NEXT:    mov x10, v7.d[1]
+; CHECK-NEXT:    scvtf s7, x8
+; CHECK-NEXT:    mov v20.s[1], v22.s[0]
+; CHECK-NEXT:    scvtf s22, x11
+; CHECK-NEXT:    fmov x11, d5
+; CHECK-NEXT:    mov x8, v5.d[1]
+; CHECK-NEXT:    mov v4.s[2], v18.s[0]
+; CHECK-NEXT:    mov v1.s[2], v6.s[0]
+; CHECK-NEXT:    scvtf s6, x13
+; CHECK-NEXT:    fadd v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    scvtf s5, x11
+; CHECK-NEXT:    mov v19.s[2], v17.s[0]
+; CHECK-NEXT:    scvtf s17, x9
+; CHECK-NEXT:    mov v3.s[3], v7.s[0]
+; CHECK-NEXT:    scvtf s7, x10
+; CHECK-NEXT:    mov v21.s[3], v22.s[0]
+; CHECK-NEXT:    mov v1.s[3], v6.s[0]
+; CHECK-NEXT:    movi v6.4s, #1
+; CHECK-NEXT:    mov v20.s[2], v5.s[0]
+; CHECK-NEXT:    scvtf s5, x8
+; CHECK-NEXT:    mov v19.s[3], v17.s[0]
+; CHECK-NEXT:    mov v4.s[3], v7.s[0]
+; CHECK-NEXT:    fmul v2.4s, v21.4s, v16.4s
+; CHECK-NEXT:    fmul v3.4s, v3.4s, v16.4s
+; CHECK-NEXT:    movi v7.4s, #127, msl #8
+; CHECK-NEXT:    mov v20.s[3], v5.s[0]
+; CHECK-NEXT:    fmul v5.4s, v19.4s, v16.4s
+; CHECK-NEXT:    fcmeq v19.4s, v0.4s, v0.4s
+; CHECK-NEXT:    fadd v2.4s, v2.4s, v4.4s
+; CHECK-NEXT:    fadd v1.4s, v3.4s, v1.4s
+; CHECK-NEXT:    ushr v3.4s, v0.4s, #16
+; CHECK-NEXT:    fadd v4.4s, v5.4s, v20.4s
+; CHECK-NEXT:    and v3.16b, v3.16b, v6.16b
+; CHECK-NEXT:    add v5.4s, v0.4s, v7.4s
+; CHECK-NEXT:    ushr v17.4s, v2.4s, #16
+; CHECK-NEXT:    ushr v16.4s, v1.4s, #16
+; CHECK-NEXT:    add v20.4s, v2.4s, v7.4s
 ; CHECK-NEXT:    orr v0.4s, #64, lsl #16
-; CHECK-NEXT:    orr v6.4s, #64, lsl #16
+; CHECK-NEXT:    add v3.4s, v3.4s, v5.4s
+; CHECK-NEXT:    and v17.16b, v17.16b, v6.16b
+; CHECK-NEXT:    ushr v18.4s, v4.4s, #16
+; CHECK-NEXT:    and v5.16b, v16.16b, v6.16b
+; CHECK-NEXT:    add v16.4s, v1.4s, v7.4s
+; CHECK-NEXT:    add v7.4s, v4.4s, v7.4s
+; CHECK-NEXT:    bit v0.16b, v3.16b, v19.16b
+; CHECK-NEXT:    add v17.4s, v17.4s, v20.4s
+; CHECK-NEXT:    fcmeq v20.4s, v4.4s, v4.4s
+; CHECK-NEXT:    and v6.16b, v18.16b, v6.16b
+; CHECK-NEXT:    add v5.4s, v5.4s, v16.4s
+; CHECK-NEXT:    fcmeq v16.4s, v1.4s, v1.4s
+; CHECK-NEXT:    fcmeq v18.4s, v2.4s, v2.4s
+; CHECK-NEXT:    orr v1.4s, #64, lsl #16
+; CHECK-NEXT:    orr v2.4s, #64, lsl #16
 ; CHECK-NEXT:    orr v4.4s, #64, lsl #16
-; CHECK-NEXT:    bit v2.16b, v5.16b, v17.16b
-; CHECK-NEXT:    mov v5.16b, v19.16b
-; CHECK-NEXT:    bit v0.16b, v7.16b, v18.16b
-; CHECK-NEXT:    bif v1.16b, v4.16b, v3.16b
-; CHECK-NEXT:    bsl v5.16b, v16.16b, v6.16b
-; CHECK-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
-; CHECK-NEXT:    uzp2 v1.8h, v1.8h, v5.8h
+; CHECK-NEXT:    add v6.4s, v6.4s, v7.4s
+; CHECK-NEXT:    mov v3.16b, v20.16b
+; CHECK-NEXT:    bit v1.16b, v5.16b, v16.16b
+; CHECK-NEXT:    bit v2.16b, v17.16b, v18.16b
+; CHECK-NEXT:    bsl v3.16b, v6.16b, v4.16b
+; CHECK-NEXT:    uzp2 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    uzp2 v1.8h, v3.8h, v2.8h
 ; CHECK-NEXT:    ret
 entry:
   %c = uitofp <16 x i64> %a to <16 x bfloat>
@@ -632,107 +890,162 @@ entry:
 define <32 x bfloat> @stofp_v32i64_v32bf16(<32 x i64> %a) {
 ; CHECK-LABEL: stofp_v32i64_v32bf16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    scvtf v17.2d, v2.2d
-; CHECK-NEXT:    scvtf v18.2d, v0.2d
-; CHECK-NEXT:    scvtf v19.2d, v3.2d
-; CHECK-NEXT:    scvtf v3.2d, v6.2d
-; CHECK-NEXT:    ldp q21, q20, [sp, #32]
-; CHECK-NEXT:    scvtf v4.2d, v4.2d
-; CHECK-NEXT:    scvtf v6.2d, v7.2d
-; CHECK-NEXT:    scvtf v5.2d, v5.2d
-; CHECK-NEXT:    ldp q24, q23, [sp, #64]
-; CHECK-NEXT:    movi v16.4s, #1
-; CHECK-NEXT:    fcvtn v0.2s, v17.2d
-; CHECK-NEXT:    scvtf v17.2d, v1.2d
-; CHECK-NEXT:    fcvtn v1.2s, v18.2d
-; CHECK-NEXT:    fcvtn v3.2s, v3.2d
-; CHECK-NEXT:    ldp q18, q7, [sp]
-; CHECK-NEXT:    scvtf v21.2d, v21.2d
-; CHECK-NEXT:    fcvtn v4.2s, v4.2d
-; CHECK-NEXT:    movi v2.4s, #127, msl #8
-; CHECK-NEXT:    scvtf v20.2d, v20.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v19.2d
-; CHECK-NEXT:    ldp q22, q19, [sp, #96]
-; CHECK-NEXT:    fcvtn2 v1.4s, v17.2d
-; CHECK-NEXT:    fcvtn2 v3.4s, v6.2d
-; CHECK-NEXT:    scvtf v18.2d, v18.2d
-; CHECK-NEXT:    scvtf v17.2d, v24.2d
-; CHECK-NEXT:    fcvtn v6.2s, v21.2d
-; CHECK-NEXT:    fcvtn2 v4.4s, v5.2d
-; CHECK-NEXT:    scvtf v22.2d, v22.2d
-; CHECK-NEXT:    scvtf v21.2d, v23.2d
-; CHECK-NEXT:    scvtf v7.2d, v7.2d
-; CHECK-NEXT:    ushr v24.4s, v0.4s, #16
-; CHECK-NEXT:    add v5.4s, v0.4s, v2.4s
-; CHECK-NEXT:    scvtf v19.2d, v19.2d
-; CHECK-NEXT:    ushr v23.4s, v1.4s, #16
-; CHECK-NEXT:    ushr v25.4s, v3.4s, #16
-; CHECK-NEXT:    fcvtn v18.2s, v18.2d
-; CHECK-NEXT:    fcvtn2 v6.4s, v20.2d
-; CHECK-NEXT:    add v26.4s, v1.4s, v2.4s
-; CHECK-NEXT:    fcvtn v17.2s, v17.2d
-; CHECK-NEXT:    and v24.16b, v24.16b, v16.16b
-; CHECK-NEXT:    fcvtn v22.2s, v22.2d
-; CHECK-NEXT:    fcmeq v20.4s, v0.4s, v0.4s
-; CHECK-NEXT:    and v23.16b, v23.16b, v16.16b
-; CHECK-NEXT:    orr v0.4s, #64, lsl #16
-; CHECK-NEXT:    fcmeq v27.4s, v3.4s, v3.4s
-; CHECK-NEXT:    fcvtn2 v18.4s, v7.2d
-; CHECK-NEXT:    add v7.4s, v3.4s, v2.4s
-; CHECK-NEXT:    orr v3.4s, #64, lsl #16
-; CHECK-NEXT:    add v5.4s, v24.4s, v5.4s
-; CHECK-NEXT:    and v24.16b, v25.16b, v16.16b
-; CHECK-NEXT:    ushr v25.4s, v4.4s, #16
-; CHECK-NEXT:    fcvtn2 v22.4s, v19.2d
-; CHECK-NEXT:    add v19.4s, v23.4s, v26.4s
-; CHECK-NEXT:    ushr v26.4s, v6.4s, #16
-; CHECK-NEXT:    fcvtn2 v17.4s, v21.2d
-; CHECK-NEXT:    fcmeq v21.4s, v1.4s, v1.4s
+; CHECK-NEXT:    fmov x10, d2
+; CHECK-NEXT:    mov x9, v3.d[1]
+; CHECK-NEXT:    mov x8, v2.d[1]
+; CHECK-NEXT:    fmov x11, d3
+; CHECK-NEXT:    fmov x12, d0
+; CHECK-NEXT:    movi v3.4s, #1
+; CHECK-NEXT:    scvtf s2, x10
+; CHECK-NEXT:    mov x10, v0.d[1]
+; CHECK-NEXT:    scvtf s19, x9
+; CHECK-NEXT:    fmov x9, d1
+; CHECK-NEXT:    scvtf s16, x11
+; CHECK-NEXT:    mov x11, v6.d[1]
+; CHECK-NEXT:    scvtf s0, x12
+; CHECK-NEXT:    scvtf s18, x8
+; CHECK-NEXT:    mov x8, v1.d[1]
+; CHECK-NEXT:    scvtf s20, x10
+; CHECK-NEXT:    scvtf s17, x9
+; CHECK-NEXT:    mov x9, v7.d[1]
+; CHECK-NEXT:    mov x10, v4.d[1]
+; CHECK-NEXT:    scvtf s21, x11
+; CHECK-NEXT:    fmov x11, d6
+; CHECK-NEXT:    mov v2.s[1], v18.s[0]
+; CHECK-NEXT:    scvtf s25, x8
+; CHECK-NEXT:    movi v6.4s, #127, msl #8
+; CHECK-NEXT:    mov v0.s[1], v20.s[0]
+; CHECK-NEXT:    ldp q24, q20, [sp, #32]
+; CHECK-NEXT:    scvtf s22, x9
+; CHECK-NEXT:    fmov x9, d4
+; CHECK-NEXT:    scvtf s1, x11
+; CHECK-NEXT:    scvtf s26, x10
+; CHECK-NEXT:    fmov x11, d7
+; CHECK-NEXT:    mov v2.s[2], v16.s[0]
+; CHECK-NEXT:    ldp q18, q16, [sp]
+; CHECK-NEXT:    mov x8, v24.d[1]
+; CHECK-NEXT:    scvtf s4, x9
+; CHECK-NEXT:    fmov x9, d5
+; CHECK-NEXT:    mov v0.s[2], v17.s[0]
+; CHECK-NEXT:    mov v1.s[1], v21.s[0]
+; CHECK-NEXT:    scvtf s23, x11
+; CHECK-NEXT:    mov x11, v5.d[1]
+; CHECK-NEXT:    mov v2.s[3], v19.s[0]
+; CHECK-NEXT:    scvtf s21, x8
+; CHECK-NEXT:    mov x8, v20.d[1]
+; CHECK-NEXT:    scvtf s17, x9
+; CHECK-NEXT:    fmov x9, d24
+; CHECK-NEXT:    mov v4.s[1], v26.s[0]
+; CHECK-NEXT:    mov v0.s[3], v25.s[0]
+; CHECK-NEXT:    ldp q26, q24, [sp, #96]
+; CHECK-NEXT:    mov v1.s[2], v23.s[0]
+; CHECK-NEXT:    ldp q25, q23, [sp, #64]
+; CHECK-NEXT:    scvtf s7, x11
+; CHECK-NEXT:    scvtf s27, x8
+; CHECK-NEXT:    fmov x8, d18
+; CHECK-NEXT:    scvtf s5, x9
+; CHECK-NEXT:    mov x10, v26.d[1]
+; CHECK-NEXT:    mov x9, v18.d[1]
+; CHECK-NEXT:    fmov x11, d20
+; CHECK-NEXT:    mov v4.s[2], v17.s[0]
+; CHECK-NEXT:    mov v1.s[3], v22.s[0]
+; CHECK-NEXT:    ushr v19.4s, v2.4s, #16
+; CHECK-NEXT:    scvtf s17, x8
+; CHECK-NEXT:    fmov x8, d26
+; CHECK-NEXT:    add v26.4s, v2.4s, v6.4s
+; CHECK-NEXT:    scvtf s22, x11
+; CHECK-NEXT:    mov x11, v25.d[1]
+; CHECK-NEXT:    mov v5.s[1], v21.s[0]
+; CHECK-NEXT:    scvtf s28, x10
+; CHECK-NEXT:    fmov x10, d16
+; CHECK-NEXT:    scvtf s21, x9
+; CHECK-NEXT:    fmov x9, d25
+; CHECK-NEXT:    scvtf s18, x8
+; CHECK-NEXT:    mov x8, v16.d[1]
+; CHECK-NEXT:    mov v4.s[3], v7.s[0]
+; CHECK-NEXT:    and v19.16b, v19.16b, v3.16b
+; CHECK-NEXT:    scvtf s16, x10
+; CHECK-NEXT:    fmov x10, d24
+; CHECK-NEXT:    scvtf s25, x11
+; CHECK-NEXT:    scvtf s20, x9
+; CHECK-NEXT:    mov x9, v24.d[1]
+; CHECK-NEXT:    mov v17.s[1], v21.s[0]
+; CHECK-NEXT:    fmov x11, d23
+; CHECK-NEXT:    mov v18.s[1], v28.s[0]
+; CHECK-NEXT:    scvtf s24, x8
+; CHECK-NEXT:    scvtf s21, x10
+; CHECK-NEXT:    mov x10, v23.d[1]
+; CHECK-NEXT:    mov v5.s[2], v22.s[0]
+; CHECK-NEXT:    ushr v22.4s, v1.4s, #16
+; CHECK-NEXT:    ushr v28.4s, v0.4s, #16
+; CHECK-NEXT:    scvtf s23, x11
+; CHECK-NEXT:    mov v20.s[1], v25.s[0]
+; CHECK-NEXT:    scvtf s25, x9
+; CHECK-NEXT:    mov v17.s[2], v16.s[0]
+; CHECK-NEXT:    add v16.4s, v19.4s, v26.4s
+; CHECK-NEXT:    ushr v26.4s, v4.4s, #16
+; CHECK-NEXT:    mov v18.s[2], v21.s[0]
+; CHECK-NEXT:    scvtf s7, x10
+; CHECK-NEXT:    and v22.16b, v22.16b, v3.16b
+; CHECK-NEXT:    mov v5.s[3], v27.s[0]
+; CHECK-NEXT:    and v21.16b, v28.16b, v3.16b
+; CHECK-NEXT:    fcmeq v19.4s, v2.4s, v2.4s
+; CHECK-NEXT:    mov v20.s[2], v23.s[0]
+; CHECK-NEXT:    add v23.4s, v0.4s, v6.4s
+; CHECK-NEXT:    orr v2.4s, #64, lsl #16
+; CHECK-NEXT:    mov v17.s[3], v24.s[0]
+; CHECK-NEXT:    add v24.4s, v1.4s, v6.4s
+; CHECK-NEXT:    fcmeq v27.4s, v1.4s, v1.4s
+; CHECK-NEXT:    mov v18.s[3], v25.s[0]
+; CHECK-NEXT:    add v25.4s, v4.4s, v6.4s
 ; CHECK-NEXT:    orr v1.4s, #64, lsl #16
-; CHECK-NEXT:    and v23.16b, v25.16b, v16.16b
-; CHECK-NEXT:    add v25.4s, v4.4s, v2.4s
-; CHECK-NEXT:    add v7.4s, v24.4s, v7.4s
-; CHECK-NEXT:    ushr v24.4s, v18.4s, #16
-; CHECK-NEXT:    add v30.4s, v18.4s, v2.4s
-; CHECK-NEXT:    bit v0.16b, v5.16b, v20.16b
-; CHECK-NEXT:    ushr v28.4s, v22.4s, #16
-; CHECK-NEXT:    add v31.4s, v22.4s, v2.4s
+; CHECK-NEXT:    bit v2.16b, v16.16b, v19.16b
+; CHECK-NEXT:    mov v20.s[3], v7.s[0]
+; CHECK-NEXT:    add v22.4s, v22.4s, v24.4s
+; CHECK-NEXT:    add v7.4s, v21.4s, v23.4s
+; CHECK-NEXT:    ushr v24.4s, v17.4s, #16
+; CHECK-NEXT:    and v23.16b, v26.16b, v3.16b
+; CHECK-NEXT:    ushr v26.4s, v5.4s, #16
+; CHECK-NEXT:    ushr v28.4s, v18.4s, #16
+; CHECK-NEXT:    add v30.4s, v17.4s, v6.4s
+; CHECK-NEXT:    add v31.4s, v18.4s, v6.4s
+; CHECK-NEXT:    fcmeq v21.4s, v0.4s, v0.4s
+; CHECK-NEXT:    orr v0.4s, #64, lsl #16
+; CHECK-NEXT:    bit v1.16b, v22.16b, v27.16b
+; CHECK-NEXT:    ushr v29.4s, v20.4s, #16
+; CHECK-NEXT:    and v24.16b, v24.16b, v3.16b
 ; CHECK-NEXT:    add v23.4s, v23.4s, v25.4s
-; CHECK-NEXT:    and v25.16b, v26.16b, v16.16b
-; CHECK-NEXT:    add v26.4s, v6.4s, v2.4s
-; CHECK-NEXT:    ushr v29.4s, v17.4s, #16
-; CHECK-NEXT:    and v24.16b, v24.16b, v16.16b
-; CHECK-NEXT:    add v2.4s, v17.4s, v2.4s
-; CHECK-NEXT:    and v28.16b, v28.16b, v16.16b
-; CHECK-NEXT:    bit v3.16b, v7.16b, v27.16b
-; CHECK-NEXT:    bit v1.16b, v19.16b, v21.16b
-; CHECK-NEXT:    add v25.4s, v25.4s, v26.4s
-; CHECK-NEXT:    fcmeq v26.4s, v6.4s, v6.4s
-; CHECK-NEXT:    orr v6.4s, #64, lsl #16
-; CHECK-NEXT:    and v16.16b, v29.16b, v16.16b
+; CHECK-NEXT:    and v28.16b, v28.16b, v3.16b
+; CHECK-NEXT:    and v25.16b, v26.16b, v3.16b
+; CHECK-NEXT:    add v26.4s, v5.4s, v6.4s
+; CHECK-NEXT:    add v6.4s, v20.4s, v6.4s
+; CHECK-NEXT:    and v3.16b, v29.16b, v3.16b
 ; CHECK-NEXT:    add v24.4s, v24.4s, v30.4s
-; CHECK-NEXT:    fcmeq v30.4s, v18.4s, v18.4s
+; CHECK-NEXT:    fcmeq v30.4s, v17.4s, v17.4s
 ; CHECK-NEXT:    add v28.4s, v28.4s, v31.4s
-; CHECK-NEXT:    fcmeq v31.4s, v22.4s, v22.4s
+; CHECK-NEXT:    fcmeq v31.4s, v18.4s, v18.4s
 ; CHECK-NEXT:    fcmeq v29.4s, v4.4s, v4.4s
+; CHECK-NEXT:    add v25.4s, v25.4s, v26.4s
+; CHECK-NEXT:    fcmeq v26.4s, v5.4s, v5.4s
 ; CHECK-NEXT:    orr v4.4s, #64, lsl #16
-; CHECK-NEXT:    orr v18.4s, #64, lsl #16
-; CHECK-NEXT:    orr v22.4s, #64, lsl #16
-; CHECK-NEXT:    mov v5.16b, v26.16b
-; CHECK-NEXT:    add v2.4s, v16.4s, v2.4s
-; CHECK-NEXT:    fcmeq v16.4s, v17.4s, v17.4s
+; CHECK-NEXT:    add v3.4s, v3.4s, v6.4s
+; CHECK-NEXT:    fcmeq v6.4s, v20.4s, v20.4s
+; CHECK-NEXT:    orr v5.4s, #64, lsl #16
 ; CHECK-NEXT:    orr v17.4s, #64, lsl #16
-; CHECK-NEXT:    uzp2 v0.8h, v1.8h, v0.8h
-; CHECK-NEXT:    mov v7.16b, v31.16b
+; CHECK-NEXT:    orr v18.4s, #64, lsl #16
+; CHECK-NEXT:    orr v20.4s, #64, lsl #16
+; CHECK-NEXT:    bit v0.16b, v7.16b, v21.16b
+; CHECK-NEXT:    mov v7.16b, v30.16b
+; CHECK-NEXT:    mov v16.16b, v31.16b
 ; CHECK-NEXT:    bit v4.16b, v23.16b, v29.16b
-; CHECK-NEXT:    bsl v5.16b, v25.16b, v6.16b
-; CHECK-NEXT:    mov v6.16b, v30.16b
-; CHECK-NEXT:    bsl v16.16b, v2.16b, v17.16b
-; CHECK-NEXT:    bsl v7.16b, v28.16b, v22.16b
-; CHECK-NEXT:    bsl v6.16b, v24.16b, v18.16b
-; CHECK-NEXT:    uzp2 v1.8h, v4.8h, v3.8h
-; CHECK-NEXT:    uzp2 v3.8h, v16.8h, v7.8h
-; CHECK-NEXT:    uzp2 v2.8h, v6.8h, v5.8h
+; CHECK-NEXT:    bit v5.16b, v25.16b, v26.16b
+; CHECK-NEXT:    bif v3.16b, v20.16b, v6.16b
+; CHECK-NEXT:    bsl v7.16b, v24.16b, v17.16b
+; CHECK-NEXT:    bsl v16.16b, v28.16b, v18.16b
+; CHECK-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    uzp2 v1.8h, v4.8h, v1.8h
+; CHECK-NEXT:    uzp2 v2.8h, v7.8h, v5.8h
+; CHECK-NEXT:    uzp2 v3.8h, v3.8h, v16.8h
 ; CHECK-NEXT:    ret
 entry:
   %c = sitofp <32 x i64> %a to <32 x bfloat>
@@ -742,107 +1055,301 @@ entry:
 define <32 x bfloat> @utofp_v32i64_v32bf16(<32 x i64> %a) {
 ; CHECK-LABEL: utofp_v32i64_v32bf16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ucvtf v17.2d, v2.2d
-; CHECK-NEXT:    ucvtf v18.2d, v0.2d
-; CHECK-NEXT:    ucvtf v19.2d, v3.2d
-; CHECK-NEXT:    ucvtf v3.2d, v6.2d
-; CHECK-NEXT:    ldp q21, q20, [sp, #32]
-; CHECK-NEXT:    ucvtf v4.2d, v4.2d
-; CHECK-NEXT:    ucvtf v6.2d, v7.2d
-; CHECK-NEXT:    ucvtf v5.2d, v5.2d
-; CHECK-NEXT:    ldp q24, q23, [sp, #64]
-; CHECK-NEXT:    movi v16.4s, #1
-; CHECK-NEXT:    fcvtn v0.2s, v17.2d
-; CHECK-NEXT:    ucvtf v17.2d, v1.2d
-; CHECK-NEXT:    fcvtn v1.2s, v18.2d
-; CHECK-NEXT:    fcvtn v3.2s, v3.2d
-; CHECK-NEXT:    ldp q18, q7, [sp]
-; CHECK-NEXT:    ucvtf v21.2d, v21.2d
-; CHECK-NEXT:    fcvtn v4.2s, v4.2d
-; CHECK-NEXT:    movi v2.4s, #127, msl #8
-; CHECK-NEXT:    ucvtf v20.2d, v20.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v19.2d
-; CHECK-NEXT:    ldp q22, q19, [sp, #96]
-; CHECK-NEXT:    fcvtn2 v1.4s, v17.2d
-; CHECK-NEXT:    fcvtn2 v3.4s, v6.2d
-; CHECK-NEXT:    ucvtf v18.2d, v18.2d
-; CHECK-NEXT:    ucvtf v17.2d, v24.2d
-; CHECK-NEXT:    fcvtn v6.2s, v21.2d
-; CHECK-NEXT:    fcvtn2 v4.4s, v5.2d
-; CHECK-NEXT:    ucvtf v22.2d, v22.2d
-; CHECK-NEXT:    ucvtf v21.2d, v23.2d
-; CHECK-NEXT:    ucvtf v7.2d, v7.2d
-; CHECK-NEXT:    ushr v24.4s, v0.4s, #16
-; CHECK-NEXT:    add v5.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ucvtf v19.2d, v19.2d
-; CHECK-NEXT:    ushr v23.4s, v1.4s, #16
-; CHECK-NEXT:    ushr v25.4s, v3.4s, #16
-; CHECK-NEXT:    fcvtn v18.2s, v18.2d
-; CHECK-NEXT:    fcvtn2 v6.4s, v20.2d
-; CHECK-NEXT:    add v26.4s, v1.4s, v2.4s
-; CHECK-NEXT:    fcvtn v17.2s, v17.2d
-; CHECK-NEXT:    and v24.16b, v24.16b, v16.16b
-; CHECK-NEXT:    fcvtn v22.2s, v22.2d
-; CHECK-NEXT:    fcmeq v20.4s, v0.4s, v0.4s
-; CHECK-NEXT:    and v23.16b, v23.16b, v16.16b
-; CHECK-NEXT:    orr v0.4s, #64, lsl #16
-; CHECK-NEXT:    fcmeq v27.4s, v3.4s, v3.4s
-; CHECK-NEXT:    fcvtn2 v18.4s, v7.2d
-; CHECK-NEXT:    add v7.4s, v3.4s, v2.4s
-; CHECK-NEXT:    orr v3.4s, #64, lsl #16
-; CHECK-NEXT:    add v5.4s, v24.4s, v5.4s
-; CHECK-NEXT:    and v24.16b, v25.16b, v16.16b
+; CHECK-NEXT:    ushr v18.2d, v3.2d, #32
+; CHECK-NEXT:    ushr v19.2d, v0.2d, #32
+; CHECK-NEXT:    movi v16.2d, #0x000000ffffffff
+; CHECK-NEXT:    ushr v17.2d, v2.2d, #32
+; CHECK-NEXT:    ushr v20.2d, v1.2d, #32
+; CHECK-NEXT:    ushr v23.2d, v6.2d, #32
+; CHECK-NEXT:    fmov x13, d18
+; CHECK-NEXT:    mov x10, v18.d[1]
+; CHECK-NEXT:    mov x11, v19.d[1]
+; CHECK-NEXT:    and v24.16b, v2.16b, v16.16b
+; CHECK-NEXT:    mov x8, v17.d[1]
+; CHECK-NEXT:    fmov x9, d17
+; CHECK-NEXT:    and v0.16b, v0.16b, v16.16b
+; CHECK-NEXT:    and v25.16b, v3.16b, v16.16b
+; CHECK-NEXT:    and v2.16b, v1.16b, v16.16b
+; CHECK-NEXT:    scvtf s26, x13
+; CHECK-NEXT:    fmov x13, d19
+; CHECK-NEXT:    and v1.16b, v6.16b, v16.16b
+; CHECK-NEXT:    mov x12, v24.d[1]
+; CHECK-NEXT:    scvtf s21, x9
+; CHECK-NEXT:    scvtf s27, x10
+; CHECK-NEXT:    scvtf s6, x8
+; CHECK-NEXT:    fmov x10, d0
+; CHECK-NEXT:    mov x9, v20.d[1]
+; CHECK-NEXT:    scvtf s22, x13
+; CHECK-NEXT:    fmov x13, d20
+; CHECK-NEXT:    mov x8, v25.d[1]
+; CHECK-NEXT:    and v31.16b, v4.16b, v16.16b
+; CHECK-NEXT:    ushr v4.2d, v4.2d, #32
+; CHECK-NEXT:    scvtf s19, x12
+; CHECK-NEXT:    scvtf s18, x10
+; CHECK-NEXT:    fmov x10, d1
+; CHECK-NEXT:    scvtf s3, x13
+; CHECK-NEXT:    fmov x13, d24
+; CHECK-NEXT:    mov v21.s[1], v6.s[0]
+; CHECK-NEXT:    scvtf s24, x11
+; CHECK-NEXT:    mov x11, v23.d[1]
+; CHECK-NEXT:    mov x12, v0.d[1]
+; CHECK-NEXT:    scvtf s28, x8
+; CHECK-NEXT:    mov x8, v2.d[1]
+; CHECK-NEXT:    scvtf s17, x13
+; CHECK-NEXT:    fmov x13, d25
+; CHECK-NEXT:    mov v21.s[2], v26.s[0]
+; CHECK-NEXT:    ushr v26.2d, v7.2d, #32
+; CHECK-NEXT:    and v7.16b, v7.16b, v16.16b
+; CHECK-NEXT:    scvtf s29, x11
+; CHECK-NEXT:    scvtf s25, x12
+; CHECK-NEXT:    mov v22.s[1], v24.s[0]
+; CHECK-NEXT:    scvtf s20, x13
+; CHECK-NEXT:    mov x13, v1.d[1]
+; CHECK-NEXT:    ldp q1, q0, [sp, #64]
+; CHECK-NEXT:    mov v17.s[1], v19.s[0]
+; CHECK-NEXT:    scvtf s19, x10
+; CHECK-NEXT:    fmov x10, d23
+; CHECK-NEXT:    fmov x12, d26
+; CHECK-NEXT:    mov x11, v26.d[1]
+; CHECK-NEXT:    mov v18.s[1], v25.s[0]
+; CHECK-NEXT:    mov v21.s[3], v27.s[0]
+; CHECK-NEXT:    ldp q25, q24, [sp, #32]
+; CHECK-NEXT:    scvtf s30, x13
+; CHECK-NEXT:    scvtf s23, x10
+; CHECK-NEXT:    mov w10, #1333788672 // =0x4f800000
+; CHECK-NEXT:    mov v17.s[2], v20.s[0]
+; CHECK-NEXT:    ldp q20, q6, [sp]
+; CHECK-NEXT:    fmov x13, d7
+; CHECK-NEXT:    scvtf s26, x12
+; CHECK-NEXT:    fmov x12, d2
+; CHECK-NEXT:    mov v22.s[2], v3.s[0]
+; CHECK-NEXT:    mov v19.s[1], v30.s[0]
+; CHECK-NEXT:    and v30.16b, v5.16b, v16.16b
+; CHECK-NEXT:    dup v2.4s, w10
+; CHECK-NEXT:    mov v17.s[3], v28.s[0]
+; CHECK-NEXT:    mov v23.s[1], v29.s[0]
+; CHECK-NEXT:    scvtf s28, x9
+; CHECK-NEXT:    mov x9, v7.d[1]
+; CHECK-NEXT:    scvtf s7, x13
+; CHECK-NEXT:    scvtf s27, x12
+; CHECK-NEXT:    mov x12, v31.d[1]
+; CHECK-NEXT:    mov x10, v4.d[1]
+; CHECK-NEXT:    ushr v5.2d, v5.2d, #32
+; CHECK-NEXT:    fmul v21.4s, v21.4s, v2.4s
+; CHECK-NEXT:    mov v23.s[2], v26.s[0]
+; CHECK-NEXT:    scvtf s26, x11
+; CHECK-NEXT:    fmov x11, d31
+; CHECK-NEXT:    mov v19.s[2], v7.s[0]
+; CHECK-NEXT:    scvtf s7, x9
+; CHECK-NEXT:    fmov x9, d30
+; CHECK-NEXT:    scvtf s29, x12
+; CHECK-NEXT:    mov v18.s[2], v27.s[0]
+; CHECK-NEXT:    scvtf s27, x8
+; CHECK-NEXT:    scvtf s3, x11
+; CHECK-NEXT:    mov v22.s[3], v28.s[0]
+; CHECK-NEXT:    mov x8, v30.d[1]
+; CHECK-NEXT:    scvtf s28, x9
+; CHECK-NEXT:    fmov x9, d4
+; CHECK-NEXT:    ushr v30.2d, v25.2d, #32
+; CHECK-NEXT:    mov v23.s[3], v26.s[0]
+; CHECK-NEXT:    and v31.16b, v25.16b, v16.16b
+; CHECK-NEXT:    mov v19.s[3], v7.s[0]
+; CHECK-NEXT:    mov v18.s[3], v27.s[0]
+; CHECK-NEXT:    fmov x11, d5
+; CHECK-NEXT:    fadd v17.4s, v21.4s, v17.4s
+; CHECK-NEXT:    scvtf s4, x9
+; CHECK-NEXT:    mov x9, v30.d[1]
+; CHECK-NEXT:    mov v3.s[1], v29.s[0]
+; CHECK-NEXT:    scvtf s29, x10
+; CHECK-NEXT:    mov x10, v5.d[1]
+; CHECK-NEXT:    fmov x13, d30
+; CHECK-NEXT:    fmul v26.4s, v23.4s, v2.4s
+; CHECK-NEXT:    and v23.16b, v24.16b, v16.16b
+; CHECK-NEXT:    ushr v24.2d, v24.2d, #32
+; CHECK-NEXT:    fmul v25.4s, v22.4s, v2.4s
+; CHECK-NEXT:    mov x12, v31.d[1]
+; CHECK-NEXT:    scvtf s22, x11
+; CHECK-NEXT:    scvtf s27, x9
+; CHECK-NEXT:    scvtf s7, x13
+; CHECK-NEXT:    mov v3.s[2], v28.s[0]
+; CHECK-NEXT:    scvtf s30, x10
+; CHECK-NEXT:    fmov x10, d24
+; CHECK-NEXT:    scvtf s28, x8
+; CHECK-NEXT:    mov x8, v23.d[1]
+; CHECK-NEXT:    fmov x11, d31
+; CHECK-NEXT:    and v31.16b, v20.16b, v16.16b
+; CHECK-NEXT:    mov x9, v24.d[1]
+; CHECK-NEXT:    mov v4.s[1], v29.s[0]
+; CHECK-NEXT:    ushr v24.2d, v20.2d, #32
+; CHECK-NEXT:    mov v7.s[1], v27.s[0]
+; CHECK-NEXT:    scvtf s27, x10
+; CHECK-NEXT:    fadd v19.4s, v26.4s, v19.4s
+; CHECK-NEXT:    scvtf s29, x12
+; CHECK-NEXT:    scvtf s5, x11
+; CHECK-NEXT:    fmov x11, d23
+; CHECK-NEXT:    scvtf s21, x8
+; CHECK-NEXT:    fmov x8, d31
+; CHECK-NEXT:    mov x10, v31.d[1]
+; CHECK-NEXT:    scvtf s26, x9
+; CHECK-NEXT:    mov x9, v24.d[1]
+; CHECK-NEXT:    mov v4.s[2], v22.s[0]
+; CHECK-NEXT:    ldp q23, q22, [sp, #96]
+; CHECK-NEXT:    mov v7.s[2], v27.s[0]
+; CHECK-NEXT:    scvtf s20, x8
+; CHECK-NEXT:    fmov x8, d24
+; CHECK-NEXT:    ushr v24.2d, v6.2d, #32
+; CHECK-NEXT:    fadd v18.4s, v25.4s, v18.4s
+; CHECK-NEXT:    mov v3.s[3], v28.s[0]
+; CHECK-NEXT:    and v28.16b, v6.16b, v16.16b
+; CHECK-NEXT:    ushr v25.2d, v23.2d, #32
+; CHECK-NEXT:    mov v5.s[1], v29.s[0]
+; CHECK-NEXT:    scvtf s29, x11
+; CHECK-NEXT:    mov v7.s[3], v26.s[0]
+; CHECK-NEXT:    scvtf s26, x9
+; CHECK-NEXT:    scvtf s6, x8
+; CHECK-NEXT:    scvtf s27, x10
+; CHECK-NEXT:    fmov x11, d24
+; CHECK-NEXT:    fmov x9, d28
+; CHECK-NEXT:    mov x10, v25.d[1]
+; CHECK-NEXT:    mov x8, v28.d[1]
+; CHECK-NEXT:    mov v4.s[3], v30.s[0]
+; CHECK-NEXT:    mov v5.s[2], v29.s[0]
+; CHECK-NEXT:    and v29.16b, v23.16b, v16.16b
+; CHECK-NEXT:    mov v6.s[1], v26.s[0]
+; CHECK-NEXT:    scvtf s26, x11
+; CHECK-NEXT:    fmov x11, d25
+; CHECK-NEXT:    mov v20.s[1], v27.s[0]
+; CHECK-NEXT:    ushr v27.2d, v1.2d, #32
+; CHECK-NEXT:    scvtf s23, x9
+; CHECK-NEXT:    mov x9, v24.d[1]
+; CHECK-NEXT:    scvtf s28, x10
+; CHECK-NEXT:    and v25.16b, v22.16b, v16.16b
+; CHECK-NEXT:    scvtf s24, x11
+; CHECK-NEXT:    fmov x10, d29
+; CHECK-NEXT:    ushr v22.2d, v22.2d, #32
+; CHECK-NEXT:    fmov x13, d27
+; CHECK-NEXT:    mov x12, v29.d[1]
+; CHECK-NEXT:    mov x11, v27.d[1]
+; CHECK-NEXT:    and v29.16b, v1.16b, v16.16b
+; CHECK-NEXT:    mov v6.s[2], v26.s[0]
+; CHECK-NEXT:    fmul v4.4s, v4.4s, v2.4s
+; CHECK-NEXT:    scvtf s1, x10
+; CHECK-NEXT:    fmov x10, d25
+; CHECK-NEXT:    scvtf s30, x9
+; CHECK-NEXT:    mov v24.s[1], v28.s[0]
+; CHECK-NEXT:    scvtf s27, x13
+; CHECK-NEXT:    ushr v28.2d, v0.2d, #32
+; CHECK-NEXT:    fmov x13, d22
+; CHECK-NEXT:    scvtf s31, x12
+; CHECK-NEXT:    mov x9, v25.d[1]
+; CHECK-NEXT:    mov x12, v29.d[1]
+; CHECK-NEXT:    scvtf s25, x11
+; CHECK-NEXT:    mov x11, v22.d[1]
+; CHECK-NEXT:    scvtf s22, x10
+; CHECK-NEXT:    fmov x10, d29
+; CHECK-NEXT:    and v0.16b, v0.16b, v16.16b
+; CHECK-NEXT:    scvtf s29, x13
+; CHECK-NEXT:    fmov x13, d28
+; CHECK-NEXT:    mov v6.s[3], v30.s[0]
+; CHECK-NEXT:    mov v1.s[1], v31.s[0]
+; CHECK-NEXT:    scvtf s30, x9
+; CHECK-NEXT:    mov v20.s[2], v23.s[0]
+; CHECK-NEXT:    scvtf s16, x12
+; CHECK-NEXT:    mov x12, v28.d[1]
+; CHECK-NEXT:    scvtf s28, x10
+; CHECK-NEXT:    mov v27.s[1], v25.s[0]
+; CHECK-NEXT:    scvtf s25, x13
+; CHECK-NEXT:    fmov x13, d0
+; CHECK-NEXT:    mov x10, v0.d[1]
+; CHECK-NEXT:    scvtf s0, x11
+; CHECK-NEXT:    mov v24.s[2], v29.s[0]
+; CHECK-NEXT:    mov v1.s[2], v22.s[0]
+; CHECK-NEXT:    movi v29.4s, #1
+; CHECK-NEXT:    movi v23.4s, #127, msl #8
+; CHECK-NEXT:    scvtf s26, x13
+; CHECK-NEXT:    scvtf s31, x12
+; CHECK-NEXT:    mov v28.s[1], v16.s[0]
+; CHECK-NEXT:    mov v27.s[2], v25.s[0]
+; CHECK-NEXT:    ushr v16.4s, v17.4s, #16
+; CHECK-NEXT:    scvtf s25, x8
+; CHECK-NEXT:    mov v24.s[3], v0.s[0]
+; CHECK-NEXT:    scvtf s0, x10
+; CHECK-NEXT:    fadd v3.4s, v4.4s, v3.4s
+; CHECK-NEXT:    mov v5.s[3], v21.s[0]
+; CHECK-NEXT:    fmul v4.4s, v7.4s, v2.4s
+; CHECK-NEXT:    mov v1.s[3], v30.s[0]
+; CHECK-NEXT:    mov v28.s[2], v26.s[0]
+; CHECK-NEXT:    and v16.16b, v16.16b, v29.16b
+; CHECK-NEXT:    add v22.4s, v17.4s, v23.4s
+; CHECK-NEXT:    mov v27.s[3], v31.s[0]
+; CHECK-NEXT:    ushr v26.4s, v18.4s, #16
+; CHECK-NEXT:    mov v20.s[3], v25.s[0]
+; CHECK-NEXT:    fmul v7.4s, v24.4s, v2.4s
+; CHECK-NEXT:    fmul v6.4s, v6.4s, v2.4s
+; CHECK-NEXT:    ushr v31.4s, v19.4s, #16
+; CHECK-NEXT:    fadd v4.4s, v4.4s, v5.4s
+; CHECK-NEXT:    ushr v24.4s, v3.4s, #16
+; CHECK-NEXT:    mov v28.s[3], v0.s[0]
+; CHECK-NEXT:    and v21.16b, v26.16b, v29.16b
+; CHECK-NEXT:    fcmeq v26.4s, v19.4s, v19.4s
+; CHECK-NEXT:    fmul v0.4s, v27.4s, v2.4s
+; CHECK-NEXT:    add v2.4s, v16.4s, v22.4s
+; CHECK-NEXT:    add v22.4s, v18.4s, v23.4s
+; CHECK-NEXT:    fadd v1.4s, v7.4s, v1.4s
+; CHECK-NEXT:    fadd v6.4s, v6.4s, v20.4s
+; CHECK-NEXT:    and v5.16b, v31.16b, v29.16b
+; CHECK-NEXT:    add v20.4s, v19.4s, v23.4s
 ; CHECK-NEXT:    ushr v25.4s, v4.4s, #16
-; CHECK-NEXT:    fcvtn2 v22.4s, v19.2d
-; CHECK-NEXT:    add v19.4s, v23.4s, v26.4s
-; CHECK-NEXT:    ushr v26.4s, v6.4s, #16
-; CHECK-NEXT:    fcvtn2 v17.4s, v21.2d
-; CHECK-NEXT:    fcmeq v21.4s, v1.4s, v1.4s
-; CHECK-NEXT:    orr v1.4s, #64, lsl #16
-; CHECK-NEXT:    and v23.16b, v25.16b, v16.16b
-; CHECK-NEXT:    add v25.4s, v4.4s, v2.4s
-; CHECK-NEXT:    add v7.4s, v24.4s, v7.4s
-; CHECK-NEXT:    ushr v24.4s, v18.4s, #16
-; CHECK-NEXT:    add v30.4s, v18.4s, v2.4s
-; CHECK-NEXT:    bit v0.16b, v5.16b, v20.16b
-; CHECK-NEXT:    ushr v28.4s, v22.4s, #16
-; CHECK-NEXT:    add v31.4s, v22.4s, v2.4s
-; CHECK-NEXT:    add v23.4s, v23.4s, v25.4s
-; CHECK-NEXT:    and v25.16b, v26.16b, v16.16b
-; CHECK-NEXT:    add v26.4s, v6.4s, v2.4s
-; CHECK-NEXT:    ushr v29.4s, v17.4s, #16
-; CHECK-NEXT:    and v24.16b, v24.16b, v16.16b
-; CHECK-NEXT:    add v2.4s, v17.4s, v2.4s
-; CHECK-NEXT:    and v28.16b, v28.16b, v16.16b
-; CHECK-NEXT:    bit v3.16b, v7.16b, v27.16b
-; CHECK-NEXT:    bit v1.16b, v19.16b, v21.16b
-; CHECK-NEXT:    add v25.4s, v25.4s, v26.4s
-; CHECK-NEXT:    fcmeq v26.4s, v6.4s, v6.4s
-; CHECK-NEXT:    orr v6.4s, #64, lsl #16
-; CHECK-NEXT:    and v16.16b, v29.16b, v16.16b
-; CHECK-NEXT:    add v24.4s, v24.4s, v30.4s
-; CHECK-NEXT:    fcmeq v30.4s, v18.4s, v18.4s
-; CHECK-NEXT:    add v28.4s, v28.4s, v31.4s
-; CHECK-NEXT:    fcmeq v31.4s, v22.4s, v22.4s
-; CHECK-NEXT:    fcmeq v29.4s, v4.4s, v4.4s
-; CHECK-NEXT:    orr v4.4s, #64, lsl #16
-; CHECK-NEXT:    orr v18.4s, #64, lsl #16
-; CHECK-NEXT:    orr v22.4s, #64, lsl #16
-; CHECK-NEXT:    mov v5.16b, v26.16b
-; CHECK-NEXT:    add v2.4s, v16.4s, v2.4s
 ; CHECK-NEXT:    fcmeq v16.4s, v17.4s, v17.4s
+; CHECK-NEXT:    add v7.4s, v21.4s, v22.4s
+; CHECK-NEXT:    and v22.16b, v24.16b, v29.16b
+; CHECK-NEXT:    add v24.4s, v3.4s, v23.4s
+; CHECK-NEXT:    fadd v0.4s, v0.4s, v28.4s
 ; CHECK-NEXT:    orr v17.4s, #64, lsl #16
-; CHECK-NEXT:    uzp2 v0.8h, v1.8h, v0.8h
-; CHECK-NEXT:    mov v7.16b, v31.16b
-; CHECK-NEXT:    bit v4.16b, v23.16b, v29.16b
-; CHECK-NEXT:    bsl v5.16b, v25.16b, v6.16b
-; CHECK-NEXT:    mov v6.16b, v30.16b
-; CHECK-NEXT:    bsl v16.16b, v2.16b, v17.16b
-; CHECK-NEXT:    bsl v7.16b, v28.16b, v22.16b
-; CHECK-NEXT:    bsl v6.16b, v24.16b, v18.16b
-; CHECK-NEXT:    uzp2 v1.8h, v4.8h, v3.8h
-; CHECK-NEXT:    uzp2 v3.8h, v16.8h, v7.8h
-; CHECK-NEXT:    uzp2 v2.8h, v6.8h, v5.8h
+; CHECK-NEXT:    fcmeq v21.4s, v18.4s, v18.4s
+; CHECK-NEXT:    ushr v27.4s, v1.4s, #16
+; CHECK-NEXT:    add v5.4s, v5.4s, v20.4s
+; CHECK-NEXT:    ushr v20.4s, v6.4s, #16
+; CHECK-NEXT:    add v22.4s, v22.4s, v24.4s
+; CHECK-NEXT:    and v24.16b, v25.16b, v29.16b
+; CHECK-NEXT:    add v25.4s, v4.4s, v23.4s
+; CHECK-NEXT:    add v30.4s, v6.4s, v23.4s
+; CHECK-NEXT:    add v31.4s, v1.4s, v23.4s
+; CHECK-NEXT:    orr v18.4s, #64, lsl #16
+; CHECK-NEXT:    ushr v28.4s, v0.4s, #16
+; CHECK-NEXT:    and v27.16b, v27.16b, v29.16b
+; CHECK-NEXT:    add v23.4s, v0.4s, v23.4s
+; CHECK-NEXT:    and v20.16b, v20.16b, v29.16b
+; CHECK-NEXT:    add v24.4s, v24.4s, v25.4s
+; CHECK-NEXT:    fcmeq v25.4s, v4.4s, v4.4s
+; CHECK-NEXT:    orr v19.4s, #64, lsl #16
+; CHECK-NEXT:    orr v4.4s, #64, lsl #16
+; CHECK-NEXT:    bif v2.16b, v17.16b, v16.16b
+; CHECK-NEXT:    and v28.16b, v28.16b, v29.16b
+; CHECK-NEXT:    add v27.4s, v27.4s, v31.4s
+; CHECK-NEXT:    fcmeq v31.4s, v1.4s, v1.4s
+; CHECK-NEXT:    fcmeq v29.4s, v3.4s, v3.4s
+; CHECK-NEXT:    add v20.4s, v20.4s, v30.4s
+; CHECK-NEXT:    fcmeq v30.4s, v6.4s, v6.4s
+; CHECK-NEXT:    orr v3.4s, #64, lsl #16
+; CHECK-NEXT:    orr v6.4s, #64, lsl #16
+; CHECK-NEXT:    orr v1.4s, #64, lsl #16
+; CHECK-NEXT:    add v23.4s, v28.4s, v23.4s
+; CHECK-NEXT:    fcmeq v28.4s, v0.4s, v0.4s
+; CHECK-NEXT:    orr v0.4s, #64, lsl #16
+; CHECK-NEXT:    mov v16.16b, v31.16b
+; CHECK-NEXT:    bif v7.16b, v18.16b, v21.16b
+; CHECK-NEXT:    bif v5.16b, v19.16b, v26.16b
+; CHECK-NEXT:    bit v3.16b, v22.16b, v29.16b
+; CHECK-NEXT:    bit v4.16b, v24.16b, v25.16b
+; CHECK-NEXT:    bit v6.16b, v20.16b, v30.16b
+; CHECK-NEXT:    mov v17.16b, v28.16b
+; CHECK-NEXT:    bsl v16.16b, v27.16b, v1.16b
+; CHECK-NEXT:    uzp2 v1.8h, v3.8h, v5.8h
+; CHECK-NEXT:    bsl v17.16b, v23.16b, v0.16b
+; CHECK-NEXT:    uzp2 v0.8h, v7.8h, v2.8h
+; CHECK-NEXT:    uzp2 v2.8h, v6.8h, v4.8h
+; CHECK-NEXT:    uzp2 v3.8h, v17.8h, v16.8h
 ; CHECK-NEXT:    ret
 entry:
   %c = uitofp <32 x i64> %a to <32 x bfloat>
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index 81c1a64f2d434..865f473fe0d28 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -4421,22 +4421,53 @@ entry:
 }
 
 define <2 x float> @stofp_v2i64_v2f32(<2 x i64> %a) {
-; CHECK-LABEL: stofp_v2i64_v2f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_v2i64_v2f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, v0.d[1]
+; CHECK-SD-NEXT:    fmov x9, d0
+; CHECK-SD-NEXT:    scvtf s0, x9
+; CHECK-SD-NEXT:    scvtf s1, x8
+; CHECK-SD-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_v2i64_v2f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-GI-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <2 x i64> %a to <2 x float>
   ret <2 x float> %c
 }
 
 define <2 x float> @utofp_v2i64_v2f32(<2 x i64> %a) {
-; CHECK-LABEL: utofp_v2i64_v2f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_v2i64_v2f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v1.2d, #0x000000ffffffff
+; CHECK-SD-NEXT:    ushr v2.2d, v0.2d, #32
+; CHECK-SD-NEXT:    mov x8, v2.d[1]
+; CHECK-SD-NEXT:    fmov x9, d2
+; CHECK-SD-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    scvtf s2, x9
+; CHECK-SD-NEXT:    mov w9, #1333788672 // =0x4f800000
+; CHECK-SD-NEXT:    scvtf s1, x8
+; CHECK-SD-NEXT:    mov x8, v0.d[1]
+; CHECK-SD-NEXT:    dup v3.2s, w9
+; CHECK-SD-NEXT:    fmov x9, d0
+; CHECK-SD-NEXT:    scvtf s0, x8
+; CHECK-SD-NEXT:    mov v2.s[1], v1.s[0]
+; CHECK-SD-NEXT:    scvtf s1, x9
+; CHECK-SD-NEXT:    fmul v2.2s, v2.2s, v3.2s
+; CHECK-SD-NEXT:    mov v1.s[1], v0.s[0]
+; CHECK-SD-NEXT:    fadd v0.2s, v2.2s, v1.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_v2i64_v2f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-GI-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <2 x i64> %a to <2 x float>
   ret <2 x float> %c
@@ -4446,13 +4477,18 @@ define <3 x float> @stofp_v3i64_v3f32(<3 x i64> %a) {
 ; CHECK-SD-LABEL: stofp_v3i64_v3f32:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    fmov x8, d0
 ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-SD-NEXT:    scvtf v1.2d, v2.2d
-; CHECK-SD-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-SD-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT:    scvtf s3, x8
+; CHECK-SD-NEXT:    fmov x8, d1
+; CHECK-SD-NEXT:    scvtf s1, x8
+; CHECK-SD-NEXT:    fmov x8, d2
+; CHECK-SD-NEXT:    mov v0.s[0], v3.s[0]
+; CHECK-SD-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-SD-NEXT:    scvtf s1, x8
+; CHECK-SD-NEXT:    mov v0.s[2], v1.s[0]
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: stofp_v3i64_v3f32:
@@ -4480,11 +4516,38 @@ define <3 x float> @utofp_v3i64_v3f32(<3 x i64> %a) {
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT:    ushr v5.2d, v2.2d, #32
 ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-SD-NEXT:    ucvtf v1.2d, v2.2d
-; CHECK-SD-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-SD-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-SD-NEXT:    movi v1.2d, #0x000000ffffffff
+; CHECK-SD-NEXT:    ushr v3.2d, v0.2d, #32
+; CHECK-SD-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    and v1.16b, v2.16b, v1.16b
+; CHECK-SD-NEXT:    mov x8, v3.d[1]
+; CHECK-SD-NEXT:    fmov x10, d3
+; CHECK-SD-NEXT:    mov x9, v0.d[1]
+; CHECK-SD-NEXT:    scvtf s4, x10
+; CHECK-SD-NEXT:    scvtf s3, x8
+; CHECK-SD-NEXT:    fmov x8, d0
+; CHECK-SD-NEXT:    scvtf s0, x9
+; CHECK-SD-NEXT:    mov x9, v5.d[1]
+; CHECK-SD-NEXT:    scvtf s2, x8
+; CHECK-SD-NEXT:    fmov x8, d5
+; CHECK-SD-NEXT:    mov v4.s[1], v3.s[0]
+; CHECK-SD-NEXT:    scvtf s3, x8
+; CHECK-SD-NEXT:    fmov x8, d1
+; CHECK-SD-NEXT:    mov v2.s[1], v0.s[0]
+; CHECK-SD-NEXT:    scvtf s0, x8
+; CHECK-SD-NEXT:    mov x8, v1.d[1]
+; CHECK-SD-NEXT:    scvtf s1, x9
+; CHECK-SD-NEXT:    mov v4.s[2], v3.s[0]
+; CHECK-SD-NEXT:    mov v2.s[2], v0.s[0]
+; CHECK-SD-NEXT:    scvtf s0, x8
+; CHECK-SD-NEXT:    mov w8, #1333788672 // =0x4f800000
+; CHECK-SD-NEXT:    mov v4.s[3], v1.s[0]
+; CHECK-SD-NEXT:    dup v1.4s, w8
+; CHECK-SD-NEXT:    mov v2.s[3], v0.s[0]
+; CHECK-SD-NEXT:    fmul v0.4s, v4.4s, v1.4s
+; CHECK-SD-NEXT:    fadd v0.4s, v0.4s, v2.4s
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: utofp_v3i64_v3f32:
@@ -4507,26 +4570,76 @@ entry:
 }
 
 define <4 x float> @stofp_v4i64_v4f32(<4 x i64> %a) {
-; CHECK-LABEL: stofp_v4i64_v4f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    scvtf v1.2d, v1.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_v4i64_v4f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, v0.d[1]
+; CHECK-SD-NEXT:    fmov x9, d0
+; CHECK-SD-NEXT:    scvtf s0, x9
+; CHECK-SD-NEXT:    mov x9, v1.d[1]
+; CHECK-SD-NEXT:    scvtf s2, x8
+; CHECK-SD-NEXT:    fmov x8, d1
+; CHECK-SD-NEXT:    scvtf s1, x8
+; CHECK-SD-NEXT:    mov v0.s[1], v2.s[0]
+; CHECK-SD-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-SD-NEXT:    scvtf s1, x9
+; CHECK-SD-NEXT:    mov v0.s[3], v1.s[0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_v4i64_v4f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-GI-NEXT:    scvtf v1.2d, v1.2d
+; CHECK-GI-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <4 x i64> %a to <4 x float>
   ret <4 x float> %c
 }
 
 define <4 x float> @utofp_v4i64_v4f32(<4 x i64> %a) {
-; CHECK-LABEL: utofp_v4i64_v4f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    ucvtf v1.2d, v1.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_v4i64_v4f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v2.2d, #0x000000ffffffff
+; CHECK-SD-NEXT:    ushr v3.2d, v0.2d, #32
+; CHECK-SD-NEXT:    ushr v4.2d, v1.2d, #32
+; CHECK-SD-NEXT:    mov x8, v3.d[1]
+; CHECK-SD-NEXT:    fmov x10, d3
+; CHECK-SD-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    scvtf s3, x10
+; CHECK-SD-NEXT:    scvtf s5, x8
+; CHECK-SD-NEXT:    fmov x8, d0
+; CHECK-SD-NEXT:    mov x9, v0.d[1]
+; CHECK-SD-NEXT:    scvtf s2, x8
+; CHECK-SD-NEXT:    fmov x8, d4
+; CHECK-SD-NEXT:    scvtf s0, x9
+; CHECK-SD-NEXT:    mov x9, v4.d[1]
+; CHECK-SD-NEXT:    mov v3.s[1], v5.s[0]
+; CHECK-SD-NEXT:    scvtf s4, x8
+; CHECK-SD-NEXT:    fmov x8, d1
+; CHECK-SD-NEXT:    mov v2.s[1], v0.s[0]
+; CHECK-SD-NEXT:    scvtf s0, x8
+; CHECK-SD-NEXT:    mov x8, v1.d[1]
+; CHECK-SD-NEXT:    scvtf s1, x9
+; CHECK-SD-NEXT:    mov v3.s[2], v4.s[0]
+; CHECK-SD-NEXT:    mov v2.s[2], v0.s[0]
+; CHECK-SD-NEXT:    scvtf s0, x8
+; CHECK-SD-NEXT:    mov w8, #1333788672 // =0x4f800000
+; CHECK-SD-NEXT:    mov v3.s[3], v1.s[0]
+; CHECK-SD-NEXT:    dup v1.4s, w8
+; CHECK-SD-NEXT:    mov v2.s[3], v0.s[0]
+; CHECK-SD-NEXT:    fmul v0.4s, v3.4s, v1.4s
+; CHECK-SD-NEXT:    fadd v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_v4i64_v4f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-GI-NEXT:    ucvtf v1.2d, v1.2d
+; CHECK-GI-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <4 x i64> %a to <4 x float>
   ret <4 x float> %c
@@ -4535,14 +4648,29 @@ entry:
 define <8 x float> @stofp_v8i64_v8f32(<8 x i64> %a) {
 ; CHECK-SD-LABEL: stofp_v8i64_v8f32:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-SD-NEXT:    scvtf v2.2d, v2.2d
-; CHECK-SD-NEXT:    scvtf v4.2d, v1.2d
-; CHECK-SD-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    fcvtn v1.2s, v2.2d
-; CHECK-SD-NEXT:    scvtf v2.2d, v3.2d
-; CHECK-SD-NEXT:    fcvtn2 v0.4s, v4.2d
-; CHECK-SD-NEXT:    fcvtn2 v1.4s, v2.2d
+; CHECK-SD-NEXT:    mov x8, v0.d[1]
+; CHECK-SD-NEXT:    mov x9, v2.d[1]
+; CHECK-SD-NEXT:    fmov x10, d0
+; CHECK-SD-NEXT:    fmov x11, d2
+; CHECK-SD-NEXT:    scvtf s0, x10
+; CHECK-SD-NEXT:    mov x10, v3.d[1]
+; CHECK-SD-NEXT:    scvtf s4, x8
+; CHECK-SD-NEXT:    scvtf s5, x9
+; CHECK-SD-NEXT:    scvtf s2, x11
+; CHECK-SD-NEXT:    fmov x9, d1
+; CHECK-SD-NEXT:    fmov x11, d3
+; CHECK-SD-NEXT:    mov x8, v1.d[1]
+; CHECK-SD-NEXT:    scvtf s1, x9
+; CHECK-SD-NEXT:    mov v0.s[1], v4.s[0]
+; CHECK-SD-NEXT:    scvtf s3, x11
+; CHECK-SD-NEXT:    mov v2.s[1], v5.s[0]
+; CHECK-SD-NEXT:    scvtf s4, x8
+; CHECK-SD-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-SD-NEXT:    scvtf s1, x10
+; CHECK-SD-NEXT:    mov v2.s[2], v3.s[0]
+; CHECK-SD-NEXT:    mov v0.s[3], v4.s[0]
+; CHECK-SD-NEXT:    mov v2.s[3], v1.s[0]
+; CHECK-SD-NEXT:    mov v1.16b, v2.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: stofp_v8i64_v8f32:
@@ -4564,14 +4692,65 @@ entry:
 define <8 x float> @utofp_v8i64_v8f32(<8 x i64> %a) {
 ; CHECK-SD-LABEL: utofp_v8i64_v8f32:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-SD-NEXT:    ucvtf v2.2d, v2.2d
-; CHECK-SD-NEXT:    ucvtf v4.2d, v1.2d
-; CHECK-SD-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    fcvtn v1.2s, v2.2d
-; CHECK-SD-NEXT:    ucvtf v2.2d, v3.2d
-; CHECK-SD-NEXT:    fcvtn2 v0.4s, v4.2d
-; CHECK-SD-NEXT:    fcvtn2 v1.4s, v2.2d
+; CHECK-SD-NEXT:    movi v4.2d, #0x000000ffffffff
+; CHECK-SD-NEXT:    ushr v5.2d, v0.2d, #32
+; CHECK-SD-NEXT:    ushr v6.2d, v2.2d, #32
+; CHECK-SD-NEXT:    ushr v7.2d, v1.2d, #32
+; CHECK-SD-NEXT:    ushr v16.2d, v3.2d, #32
+; CHECK-SD-NEXT:    mov x8, v5.d[1]
+; CHECK-SD-NEXT:    mov x9, v6.d[1]
+; CHECK-SD-NEXT:    fmov x10, d5
+; CHECK-SD-NEXT:    and v0.16b, v0.16b, v4.16b
+; CHECK-SD-NEXT:    and v2.16b, v2.16b, v4.16b
+; CHECK-SD-NEXT:    fmov x13, d6
+; CHECK-SD-NEXT:    fmov x12, d7
+; CHECK-SD-NEXT:    and v1.16b, v1.16b, v4.16b
+; CHECK-SD-NEXT:    mov x11, v7.d[1]
+; CHECK-SD-NEXT:    scvtf s5, x10
+; CHECK-SD-NEXT:    and v3.16b, v3.16b, v4.16b
+; CHECK-SD-NEXT:    mov x10, v0.d[1]
+; CHECK-SD-NEXT:    scvtf s6, x8
+; CHECK-SD-NEXT:    mov x8, v2.d[1]
+; CHECK-SD-NEXT:    scvtf s4, x13
+; CHECK-SD-NEXT:    scvtf s7, x9
+; CHECK-SD-NEXT:    fmov x9, d16
+; CHECK-SD-NEXT:    scvtf s17, x12
+; CHECK-SD-NEXT:    fmov x12, d0
+; CHECK-SD-NEXT:    fmov x13, d2
+; CHECK-SD-NEXT:    scvtf s2, x10
+; CHECK-SD-NEXT:    mov v5.s[1], v6.s[0]
+; CHECK-SD-NEXT:    scvtf s6, x8
+; CHECK-SD-NEXT:    scvtf s0, x12
+; CHECK-SD-NEXT:    scvtf s18, x13
+; CHECK-SD-NEXT:    mov x8, v16.d[1]
+; CHECK-SD-NEXT:    mov v4.s[1], v7.s[0]
+; CHECK-SD-NEXT:    scvtf s7, x9
+; CHECK-SD-NEXT:    fmov x10, d1
+; CHECK-SD-NEXT:    fmov x13, d3
+; CHECK-SD-NEXT:    mov x9, v1.d[1]
+; CHECK-SD-NEXT:    mov x12, v3.d[1]
+; CHECK-SD-NEXT:    mov v5.s[2], v17.s[0]
+; CHECK-SD-NEXT:    mov v0.s[1], v2.s[0]
+; CHECK-SD-NEXT:    scvtf s1, x10
+; CHECK-SD-NEXT:    mov v18.s[1], v6.s[0]
+; CHECK-SD-NEXT:    scvtf s2, x11
+; CHECK-SD-NEXT:    scvtf s3, x13
+; CHECK-SD-NEXT:    mov v4.s[2], v7.s[0]
+; CHECK-SD-NEXT:    scvtf s6, x8
+; CHECK-SD-NEXT:    mov w8, #1333788672 // =0x4f800000
+; CHECK-SD-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-SD-NEXT:    scvtf s1, x9
+; CHECK-SD-NEXT:    mov v5.s[3], v2.s[0]
+; CHECK-SD-NEXT:    scvtf s2, x12
+; CHECK-SD-NEXT:    mov v18.s[2], v3.s[0]
+; CHECK-SD-NEXT:    mov v4.s[3], v6.s[0]
+; CHECK-SD-NEXT:    dup v3.4s, w8
+; CHECK-SD-NEXT:    mov v0.s[3], v1.s[0]
+; CHECK-SD-NEXT:    fmul v1.4s, v5.4s, v3.4s
+; CHECK-SD-NEXT:    mov v18.s[3], v2.s[0]
+; CHECK-SD-NEXT:    fmul v2.4s, v4.4s, v3.4s
+; CHECK-SD-NEXT:    fadd v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT:    fadd v1.4s, v2.4s, v18.4s
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: utofp_v8i64_v8f32:
@@ -4591,50 +4770,218 @@ entry:
 }
 
 define <16 x float> @stofp_v16i64_v16f32(<16 x i64> %a) {
-; CHECK-LABEL: stofp_v16i64_v16f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    scvtf v2.2d, v2.2d
-; CHECK-NEXT:    scvtf v4.2d, v4.2d
-; CHECK-NEXT:    scvtf v6.2d, v6.2d
-; CHECK-NEXT:    scvtf v16.2d, v1.2d
-; CHECK-NEXT:    scvtf v17.2d, v3.2d
-; CHECK-NEXT:    scvtf v5.2d, v5.2d
-; CHECK-NEXT:    scvtf v7.2d, v7.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn v1.2s, v2.2d
-; CHECK-NEXT:    fcvtn v2.2s, v4.2d
-; CHECK-NEXT:    fcvtn v3.2s, v6.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v16.2d
-; CHECK-NEXT:    fcvtn2 v1.4s, v17.2d
-; CHECK-NEXT:    fcvtn2 v2.4s, v5.2d
-; CHECK-NEXT:    fcvtn2 v3.4s, v7.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_v16i64_v16f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov x13, d2
+; CHECK-SD-NEXT:    mov x9, v0.d[1]
+; CHECK-SD-NEXT:    mov x10, v2.d[1]
+; CHECK-SD-NEXT:    fmov x11, d0
+; CHECK-SD-NEXT:    mov x12, v4.d[1]
+; CHECK-SD-NEXT:    mov x8, v1.d[1]
+; CHECK-SD-NEXT:    scvtf s16, x13
+; CHECK-SD-NEXT:    fmov x13, d4
+; CHECK-SD-NEXT:    scvtf s0, x11
+; CHECK-SD-NEXT:    mov x11, v6.d[1]
+; CHECK-SD-NEXT:    scvtf s17, x9
+; CHECK-SD-NEXT:    scvtf s18, x10
+; CHECK-SD-NEXT:    fmov x9, d1
+; CHECK-SD-NEXT:    scvtf s1, x12
+; CHECK-SD-NEXT:    fmov x12, d6
+; CHECK-SD-NEXT:    scvtf s2, x13
+; CHECK-SD-NEXT:    fmov x13, d3
+; CHECK-SD-NEXT:    mov x10, v3.d[1]
+; CHECK-SD-NEXT:    scvtf s4, x11
+; CHECK-SD-NEXT:    mov v0.s[1], v17.s[0]
+; CHECK-SD-NEXT:    scvtf s6, x9
+; CHECK-SD-NEXT:    scvtf s3, x12
+; CHECK-SD-NEXT:    mov v16.s[1], v18.s[0]
+; CHECK-SD-NEXT:    mov x9, v5.d[1]
+; CHECK-SD-NEXT:    fmov x11, d5
+; CHECK-SD-NEXT:    scvtf s5, x13
+; CHECK-SD-NEXT:    fmov x13, d7
+; CHECK-SD-NEXT:    mov x12, v7.d[1]
+; CHECK-SD-NEXT:    mov v2.s[1], v1.s[0]
+; CHECK-SD-NEXT:    mov v0.s[2], v6.s[0]
+; CHECK-SD-NEXT:    scvtf s6, x10
+; CHECK-SD-NEXT:    scvtf s7, x11
+; CHECK-SD-NEXT:    scvtf s1, x13
+; CHECK-SD-NEXT:    mov v3.s[1], v4.s[0]
+; CHECK-SD-NEXT:    mov v16.s[2], v5.s[0]
+; CHECK-SD-NEXT:    scvtf s4, x8
+; CHECK-SD-NEXT:    scvtf s5, x9
+; CHECK-SD-NEXT:    mov v2.s[2], v7.s[0]
+; CHECK-SD-NEXT:    mov v3.s[2], v1.s[0]
+; CHECK-SD-NEXT:    scvtf s1, x12
+; CHECK-SD-NEXT:    mov v16.s[3], v6.s[0]
+; CHECK-SD-NEXT:    mov v0.s[3], v4.s[0]
+; CHECK-SD-NEXT:    mov v2.s[3], v5.s[0]
+; CHECK-SD-NEXT:    mov v3.s[3], v1.s[0]
+; CHECK-SD-NEXT:    mov v1.16b, v16.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_v16i64_v16f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-GI-NEXT:    scvtf v2.2d, v2.2d
+; CHECK-GI-NEXT:    scvtf v4.2d, v4.2d
+; CHECK-GI-NEXT:    scvtf v6.2d, v6.2d
+; CHECK-GI-NEXT:    scvtf v16.2d, v1.2d
+; CHECK-GI-NEXT:    scvtf v17.2d, v3.2d
+; CHECK-GI-NEXT:    scvtf v5.2d, v5.2d
+; CHECK-GI-NEXT:    scvtf v7.2d, v7.2d
+; CHECK-GI-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    fcvtn v1.2s, v2.2d
+; CHECK-GI-NEXT:    fcvtn v2.2s, v4.2d
+; CHECK-GI-NEXT:    fcvtn v3.2s, v6.2d
+; CHECK-GI-NEXT:    fcvtn2 v0.4s, v16.2d
+; CHECK-GI-NEXT:    fcvtn2 v1.4s, v17.2d
+; CHECK-GI-NEXT:    fcvtn2 v2.4s, v5.2d
+; CHECK-GI-NEXT:    fcvtn2 v3.4s, v7.2d
+; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <16 x i64> %a to <16 x float>
   ret <16 x float> %c
 }
 
 define <16 x float> @utofp_v16i64_v16f32(<16 x i64> %a) {
-; CHECK-LABEL: utofp_v16i64_v16f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    ucvtf v2.2d, v2.2d
-; CHECK-NEXT:    ucvtf v4.2d, v4.2d
-; CHECK-NEXT:    ucvtf v6.2d, v6.2d
-; CHECK-NEXT:    ucvtf v16.2d, v1.2d
-; CHECK-NEXT:    ucvtf v17.2d, v3.2d
-; CHECK-NEXT:    ucvtf v5.2d, v5.2d
-; CHECK-NEXT:    ucvtf v7.2d, v7.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn v1.2s, v2.2d
-; CHECK-NEXT:    fcvtn v2.2s, v4.2d
-; CHECK-NEXT:    fcvtn v3.2s, v6.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v16.2d
-; CHECK-NEXT:    fcvtn2 v1.4s, v17.2d
-; CHECK-NEXT:    fcvtn2 v2.4s, v5.2d
-; CHECK-NEXT:    fcvtn2 v3.4s, v7.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_v16i64_v16f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v16.2d, #0x000000ffffffff
+; CHECK-SD-NEXT:    ushr v17.2d, v0.2d, #32
+; CHECK-SD-NEXT:    ushr v18.2d, v1.2d, #32
+; CHECK-SD-NEXT:    ushr v20.2d, v2.2d, #32
+; CHECK-SD-NEXT:    ushr v19.2d, v3.2d, #32
+; CHECK-SD-NEXT:    ushr v22.2d, v6.2d, #32
+; CHECK-SD-NEXT:    ushr v21.2d, v4.2d, #32
+; CHECK-SD-NEXT:    mov x8, v17.d[1]
+; CHECK-SD-NEXT:    fmov x9, d17
+; CHECK-SD-NEXT:    mov x10, v18.d[1]
+; CHECK-SD-NEXT:    and v0.16b, v0.16b, v16.16b
+; CHECK-SD-NEXT:    and v23.16b, v1.16b, v16.16b
+; CHECK-SD-NEXT:    and v17.16b, v2.16b, v16.16b
+; CHECK-SD-NEXT:    mov x12, v20.d[1]
+; CHECK-SD-NEXT:    fmov x13, d20
+; CHECK-SD-NEXT:    and v6.16b, v6.16b, v16.16b
+; CHECK-SD-NEXT:    scvtf s1, x9
+; CHECK-SD-NEXT:    mov x9, v19.d[1]
+; CHECK-SD-NEXT:    mov x11, v0.d[1]
+; CHECK-SD-NEXT:    scvtf s24, x8
+; CHECK-SD-NEXT:    fmov x8, d18
+; CHECK-SD-NEXT:    and v18.16b, v4.16b, v16.16b
+; CHECK-SD-NEXT:    fmov x14, d0
+; CHECK-SD-NEXT:    scvtf s2, x13
+; CHECK-SD-NEXT:    fmov x13, d17
+; CHECK-SD-NEXT:    scvtf s25, x8
+; CHECK-SD-NEXT:    mov x8, v23.d[1]
+; CHECK-SD-NEXT:    scvtf s20, x11
+; CHECK-SD-NEXT:    fmov x11, d23
+; CHECK-SD-NEXT:    scvtf s0, x14
+; CHECK-SD-NEXT:    mov x14, v17.d[1]
+; CHECK-SD-NEXT:    scvtf s23, x12
+; CHECK-SD-NEXT:    scvtf s4, x13
+; CHECK-SD-NEXT:    fmov x12, d19
+; CHECK-SD-NEXT:    scvtf s19, x10
+; CHECK-SD-NEXT:    fmov x10, d18
+; CHECK-SD-NEXT:    scvtf s17, x11
+; CHECK-SD-NEXT:    mov x11, v18.d[1]
+; CHECK-SD-NEXT:    mov v1.s[1], v24.s[0]
+; CHECK-SD-NEXT:    mov v0.s[1], v20.s[0]
+; CHECK-SD-NEXT:    and v24.16b, v3.16b, v16.16b
+; CHECK-SD-NEXT:    scvtf s20, x14
+; CHECK-SD-NEXT:    scvtf s3, x10
+; CHECK-SD-NEXT:    mov x10, v21.d[1]
+; CHECK-SD-NEXT:    fmov x14, d21
+; CHECK-SD-NEXT:    mov v2.s[1], v23.s[0]
+; CHECK-SD-NEXT:    scvtf s18, x11
+; CHECK-SD-NEXT:    mov x11, v22.d[1]
+; CHECK-SD-NEXT:    mov v1.s[2], v25.s[0]
+; CHECK-SD-NEXT:    mov v0.s[2], v17.s[0]
+; CHECK-SD-NEXT:    ushr v17.2d, v5.2d, #32
+; CHECK-SD-NEXT:    fmov x13, d24
+; CHECK-SD-NEXT:    mov v4.s[1], v20.s[0]
+; CHECK-SD-NEXT:    and v5.16b, v5.16b, v16.16b
+; CHECK-SD-NEXT:    scvtf s25, x12
+; CHECK-SD-NEXT:    mov x12, v24.d[1]
+; CHECK-SD-NEXT:    scvtf s20, x11
+; CHECK-SD-NEXT:    fmov x11, d22
+; CHECK-SD-NEXT:    mov v1.s[3], v19.s[0]
+; CHECK-SD-NEXT:    mov v3.s[1], v18.s[0]
+; CHECK-SD-NEXT:    scvtf s18, x10
+; CHECK-SD-NEXT:    scvtf s19, x14
+; CHECK-SD-NEXT:    mov x14, v17.d[1]
+; CHECK-SD-NEXT:    mov x10, v6.d[1]
+; CHECK-SD-NEXT:    mov v2.s[2], v25.s[0]
+; CHECK-SD-NEXT:    scvtf s21, x11
+; CHECK-SD-NEXT:    fmov x11, d17
+; CHECK-SD-NEXT:    ushr v17.2d, v7.2d, #32
+; CHECK-SD-NEXT:    and v7.16b, v7.16b, v16.16b
+; CHECK-SD-NEXT:    scvtf s16, x13
+; CHECK-SD-NEXT:    fmov x13, d5
+; CHECK-SD-NEXT:    mov v19.s[1], v18.s[0]
+; CHECK-SD-NEXT:    scvtf s22, x11
+; CHECK-SD-NEXT:    fmov x11, d6
+; CHECK-SD-NEXT:    scvtf s6, x10
+; CHECK-SD-NEXT:    mov x10, v17.d[1]
+; CHECK-SD-NEXT:    mov v21.s[1], v20.s[0]
+; CHECK-SD-NEXT:    scvtf s20, x13
+; CHECK-SD-NEXT:    fmov x13, d7
+; CHECK-SD-NEXT:    mov v4.s[2], v16.s[0]
+; CHECK-SD-NEXT:    scvtf s16, x8
+; CHECK-SD-NEXT:    mov w8, #1333788672 // =0x4f800000
+; CHECK-SD-NEXT:    scvtf s18, x11
+; CHECK-SD-NEXT:    fmov x11, d17
+; CHECK-SD-NEXT:    mov v19.s[2], v22.s[0]
+; CHECK-SD-NEXT:    mov v3.s[2], v20.s[0]
+; CHECK-SD-NEXT:    scvtf s20, x12
+; CHECK-SD-NEXT:    scvtf s17, x11
+; CHECK-SD-NEXT:    mov x11, v5.d[1]
+; CHECK-SD-NEXT:    scvtf s5, x9
+; CHECK-SD-NEXT:    mov v18.s[1], v6.s[0]
+; CHECK-SD-NEXT:    scvtf s6, x14
+; CHECK-SD-NEXT:    mov x9, v7.d[1]
+; CHECK-SD-NEXT:    scvtf s7, x13
+; CHECK-SD-NEXT:    mov v0.s[3], v16.s[0]
+; CHECK-SD-NEXT:    mov v4.s[3], v20.s[0]
+; CHECK-SD-NEXT:    mov v21.s[2], v17.s[0]
+; CHECK-SD-NEXT:    scvtf s17, x10
+; CHECK-SD-NEXT:    mov v2.s[3], v5.s[0]
+; CHECK-SD-NEXT:    mov v19.s[3], v6.s[0]
+; CHECK-SD-NEXT:    scvtf s6, x11
+; CHECK-SD-NEXT:    dup v5.4s, w8
+; CHECK-SD-NEXT:    mov v18.s[2], v7.s[0]
+; CHECK-SD-NEXT:    scvtf s7, x9
+; CHECK-SD-NEXT:    mov v21.s[3], v17.s[0]
+; CHECK-SD-NEXT:    fmul v1.4s, v1.4s, v5.4s
+; CHECK-SD-NEXT:    fmul v2.4s, v2.4s, v5.4s
+; CHECK-SD-NEXT:    mov v3.s[3], v6.s[0]
+; CHECK-SD-NEXT:    fmul v6.4s, v19.4s, v5.4s
+; CHECK-SD-NEXT:    mov v18.s[3], v7.s[0]
+; CHECK-SD-NEXT:    fmul v5.4s, v21.4s, v5.4s
+; CHECK-SD-NEXT:    fadd v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT:    fadd v1.4s, v2.4s, v4.4s
+; CHECK-SD-NEXT:    fadd v2.4s, v6.4s, v3.4s
+; CHECK-SD-NEXT:    fadd v3.4s, v5.4s, v18.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_v16i64_v16f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-GI-NEXT:    ucvtf v2.2d, v2.2d
+; CHECK-GI-NEXT:    ucvtf v4.2d, v4.2d
+; CHECK-GI-NEXT:    ucvtf v6.2d, v6.2d
+; CHECK-GI-NEXT:    ucvtf v16.2d, v1.2d
+; CHECK-GI-NEXT:    ucvtf v17.2d, v3.2d
+; CHECK-GI-NEXT:    ucvtf v5.2d, v5.2d
+; CHECK-GI-NEXT:    ucvtf v7.2d, v7.2d
+; CHECK-GI-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    fcvtn v1.2s, v2.2d
+; CHECK-GI-NEXT:    fcvtn v2.2s, v4.2d
+; CHECK-GI-NEXT:    fcvtn v3.2s, v6.2d
+; CHECK-GI-NEXT:    fcvtn2 v0.4s, v16.2d
+; CHECK-GI-NEXT:    fcvtn2 v1.4s, v17.2d
+; CHECK-GI-NEXT:    fcvtn2 v2.4s, v5.2d
+; CHECK-GI-NEXT:    fcvtn2 v3.4s, v7.2d
+; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <16 x i64> %a to <16 x float>
   ret <16 x float> %c
@@ -4643,42 +4990,99 @@ entry:
 define <32 x float> @stofp_v32i64_v32f32(<32 x i64> %a) {
 ; CHECK-SD-LABEL: stofp_v32i64_v32f32:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ldp q17, q16, [sp, #64]
-; CHECK-SD-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-SD-NEXT:    ldp q19, q18, [sp, #32]
-; CHECK-SD-NEXT:    scvtf v2.2d, v2.2d
-; CHECK-SD-NEXT:    ldp q21, q20, [sp]
-; CHECK-SD-NEXT:    scvtf v4.2d, v4.2d
-; CHECK-SD-NEXT:    ldp q23, q22, [sp, #96]
-; CHECK-SD-NEXT:    scvtf v6.2d, v6.2d
-; CHECK-SD-NEXT:    scvtf v19.2d, v19.2d
-; CHECK-SD-NEXT:    scvtf v17.2d, v17.2d
-; CHECK-SD-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    scvtf v21.2d, v21.2d
-; CHECK-SD-NEXT:    scvtf v24.2d, v1.2d
-; CHECK-SD-NEXT:    fcvtn v1.2s, v2.2d
-; CHECK-SD-NEXT:    scvtf v23.2d, v23.2d
-; CHECK-SD-NEXT:    scvtf v25.2d, v3.2d
-; CHECK-SD-NEXT:    fcvtn v2.2s, v4.2d
-; CHECK-SD-NEXT:    scvtf v26.2d, v5.2d
-; CHECK-SD-NEXT:    fcvtn v3.2s, v6.2d
-; CHECK-SD-NEXT:    scvtf v27.2d, v7.2d
-; CHECK-SD-NEXT:    scvtf v20.2d, v20.2d
-; CHECK-SD-NEXT:    fcvtn v5.2s, v19.2d
-; CHECK-SD-NEXT:    scvtf v18.2d, v18.2d
-; CHECK-SD-NEXT:    fcvtn v4.2s, v21.2d
-; CHECK-SD-NEXT:    fcvtn v6.2s, v17.2d
-; CHECK-SD-NEXT:    scvtf v16.2d, v16.2d
-; CHECK-SD-NEXT:    fcvtn v7.2s, v23.2d
-; CHECK-SD-NEXT:    scvtf v17.2d, v22.2d
-; CHECK-SD-NEXT:    fcvtn2 v0.4s, v24.2d
-; CHECK-SD-NEXT:    fcvtn2 v1.4s, v25.2d
-; CHECK-SD-NEXT:    fcvtn2 v2.4s, v26.2d
-; CHECK-SD-NEXT:    fcvtn2 v3.4s, v27.2d
-; CHECK-SD-NEXT:    fcvtn2 v5.4s, v18.2d
-; CHECK-SD-NEXT:    fcvtn2 v4.4s, v20.2d
-; CHECK-SD-NEXT:    fcvtn2 v6.4s, v16.2d
-; CHECK-SD-NEXT:    fcvtn2 v7.4s, v17.2d
+; CHECK-SD-NEXT:    mov x8, v0.d[1]
+; CHECK-SD-NEXT:    fmov x10, d0
+; CHECK-SD-NEXT:    mov v16.16b, v1.16b
+; CHECK-SD-NEXT:    fmov x11, d2
+; CHECK-SD-NEXT:    ldp q24, q20, [sp]
+; CHECK-SD-NEXT:    mov x9, v2.d[1]
+; CHECK-SD-NEXT:    fmov x12, d3
+; CHECK-SD-NEXT:    fmov x13, d4
+; CHECK-SD-NEXT:    scvtf s0, x10
+; CHECK-SD-NEXT:    ldp q21, q18, [sp, #32]
+; CHECK-SD-NEXT:    scvtf s2, x8
+; CHECK-SD-NEXT:    scvtf s1, x11
+; CHECK-SD-NEXT:    mov x10, v4.d[1]
+; CHECK-SD-NEXT:    fmov x11, d16
+; CHECK-SD-NEXT:    ldp q19, q17, [sp, #96]
+; CHECK-SD-NEXT:    scvtf s22, x9
+; CHECK-SD-NEXT:    mov x8, v3.d[1]
+; CHECK-SD-NEXT:    scvtf s4, x12
+; CHECK-SD-NEXT:    mov x12, v24.d[1]
+; CHECK-SD-NEXT:    mov x9, v16.d[1]
+; CHECK-SD-NEXT:    scvtf s3, x11
+; CHECK-SD-NEXT:    ldp q23, q16, [sp, #64]
+; CHECK-SD-NEXT:    mov v0.s[1], v2.s[0]
+; CHECK-SD-NEXT:    scvtf s25, x10
+; CHECK-SD-NEXT:    fmov x10, d6
+; CHECK-SD-NEXT:    mov v1.s[1], v22.s[0]
+; CHECK-SD-NEXT:    mov x11, v6.d[1]
+; CHECK-SD-NEXT:    scvtf s2, x13
+; CHECK-SD-NEXT:    mov x13, v21.d[1]
+; CHECK-SD-NEXT:    fmov x14, d19
+; CHECK-SD-NEXT:    scvtf s22, x9
+; CHECK-SD-NEXT:    mov x9, v5.d[1]
+; CHECK-SD-NEXT:    fmov x15, d17
+; CHECK-SD-NEXT:    mov v0.s[2], v3.s[0]
+; CHECK-SD-NEXT:    scvtf s3, x10
+; CHECK-SD-NEXT:    fmov x10, d24
+; CHECK-SD-NEXT:    mov v1.s[2], v4.s[0]
+; CHECK-SD-NEXT:    scvtf s24, x12
+; CHECK-SD-NEXT:    scvtf s6, x11
+; CHECK-SD-NEXT:    fmov x11, d5
+; CHECK-SD-NEXT:    fmov x12, d7
+; CHECK-SD-NEXT:    mov v2.s[1], v25.s[0]
+; CHECK-SD-NEXT:    scvtf s4, x10
+; CHECK-SD-NEXT:    fmov x10, d21
+; CHECK-SD-NEXT:    scvtf s21, x8
+; CHECK-SD-NEXT:    mov x8, v23.d[1]
+; CHECK-SD-NEXT:    scvtf s25, x13
+; CHECK-SD-NEXT:    mov x13, v19.d[1]
+; CHECK-SD-NEXT:    scvtf s26, x11
+; CHECK-SD-NEXT:    mov x11, v20.d[1]
+; CHECK-SD-NEXT:    mov v3.s[1], v6.s[0]
+; CHECK-SD-NEXT:    scvtf s5, x10
+; CHECK-SD-NEXT:    mov x10, v7.d[1]
+; CHECK-SD-NEXT:    scvtf s7, x14
+; CHECK-SD-NEXT:    mov v4.s[1], v24.s[0]
+; CHECK-SD-NEXT:    scvtf s24, x12
+; CHECK-SD-NEXT:    fmov x12, d20
+; CHECK-SD-NEXT:    scvtf s20, x8
+; CHECK-SD-NEXT:    fmov x8, d23
+; CHECK-SD-NEXT:    scvtf s19, x13
+; CHECK-SD-NEXT:    fmov x13, d18
+; CHECK-SD-NEXT:    fmov x14, d16
+; CHECK-SD-NEXT:    mov v2.s[2], v26.s[0]
+; CHECK-SD-NEXT:    mov v5.s[1], v25.s[0]
+; CHECK-SD-NEXT:    scvtf s23, x10
+; CHECK-SD-NEXT:    mov v0.s[3], v22.s[0]
+; CHECK-SD-NEXT:    scvtf s6, x8
+; CHECK-SD-NEXT:    mov x8, v18.d[1]
+; CHECK-SD-NEXT:    scvtf s18, x12
+; CHECK-SD-NEXT:    mov x12, v16.d[1]
+; CHECK-SD-NEXT:    scvtf s16, x13
+; CHECK-SD-NEXT:    mov x13, v17.d[1]
+; CHECK-SD-NEXT:    scvtf s17, x14
+; CHECK-SD-NEXT:    mov v7.s[1], v19.s[0]
+; CHECK-SD-NEXT:    scvtf s19, x9
+; CHECK-SD-NEXT:    mov v3.s[2], v24.s[0]
+; CHECK-SD-NEXT:    scvtf s24, x11
+; CHECK-SD-NEXT:    mov v1.s[3], v21.s[0]
+; CHECK-SD-NEXT:    mov v6.s[1], v20.s[0]
+; CHECK-SD-NEXT:    scvtf s20, x15
+; CHECK-SD-NEXT:    mov v4.s[2], v18.s[0]
+; CHECK-SD-NEXT:    scvtf s18, x8
+; CHECK-SD-NEXT:    mov v5.s[2], v16.s[0]
+; CHECK-SD-NEXT:    scvtf s16, x12
+; CHECK-SD-NEXT:    mov v2.s[3], v19.s[0]
+; CHECK-SD-NEXT:    mov v3.s[3], v23.s[0]
+; CHECK-SD-NEXT:    mov v6.s[2], v17.s[0]
+; CHECK-SD-NEXT:    mov v7.s[2], v20.s[0]
+; CHECK-SD-NEXT:    scvtf s17, x13
+; CHECK-SD-NEXT:    mov v4.s[3], v24.s[0]
+; CHECK-SD-NEXT:    mov v5.s[3], v18.s[0]
+; CHECK-SD-NEXT:    mov v6.s[3], v16.s[0]
+; CHECK-SD-NEXT:    mov v7.s[3], v17.s[0]
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: stofp_v32i64_v32f32:
@@ -4728,42 +5132,242 @@ entry:
 define <32 x float> @utofp_v32i64_v32f32(<32 x i64> %a) {
 ; CHECK-SD-LABEL: utofp_v32i64_v32f32:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ldp q17, q16, [sp, #64]
-; CHECK-SD-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-SD-NEXT:    ldp q19, q18, [sp, #32]
-; CHECK-SD-NEXT:    ucvtf v2.2d, v2.2d
-; CHECK-SD-NEXT:    ldp q21, q20, [sp]
-; CHECK-SD-NEXT:    ucvtf v4.2d, v4.2d
-; CHECK-SD-NEXT:    ldp q23, q22, [sp, #96]
-; CHECK-SD-NEXT:    ucvtf v6.2d, v6.2d
-; CHECK-SD-NEXT:    ucvtf v19.2d, v19.2d
-; CHECK-SD-NEXT:    ucvtf v17.2d, v17.2d
-; CHECK-SD-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    ucvtf v21.2d, v21.2d
-; CHECK-SD-NEXT:    ucvtf v24.2d, v1.2d
-; CHECK-SD-NEXT:    fcvtn v1.2s, v2.2d
-; CHECK-SD-NEXT:    ucvtf v23.2d, v23.2d
-; CHECK-SD-NEXT:    ucvtf v25.2d, v3.2d
-; CHECK-SD-NEXT:    fcvtn v2.2s, v4.2d
-; CHECK-SD-NEXT:    ucvtf v26.2d, v5.2d
-; CHECK-SD-NEXT:    fcvtn v3.2s, v6.2d
-; CHECK-SD-NEXT:    ucvtf v27.2d, v7.2d
-; CHECK-SD-NEXT:    ucvtf v20.2d, v20.2d
-; CHECK-SD-NEXT:    fcvtn v5.2s, v19.2d
-; CHECK-SD-NEXT:    ucvtf v18.2d, v18.2d
-; CHECK-SD-NEXT:    fcvtn v4.2s, v21.2d
-; CHECK-SD-NEXT:    fcvtn v6.2s, v17.2d
-; CHECK-SD-NEXT:    ucvtf v16.2d, v16.2d
-; CHECK-SD-NEXT:    fcvtn v7.2s, v23.2d
-; CHECK-SD-NEXT:    ucvtf v17.2d, v22.2d
-; CHECK-SD-NEXT:    fcvtn2 v0.4s, v24.2d
-; CHECK-SD-NEXT:    fcvtn2 v1.4s, v25.2d
-; CHECK-SD-NEXT:    fcvtn2 v2.4s, v26.2d
-; CHECK-SD-NEXT:    fcvtn2 v3.4s, v27.2d
-; CHECK-SD-NEXT:    fcvtn2 v5.4s, v18.2d
-; CHECK-SD-NEXT:    fcvtn2 v4.4s, v20.2d
-; CHECK-SD-NEXT:    fcvtn2 v6.4s, v16.2d
-; CHECK-SD-NEXT:    fcvtn2 v7.4s, v17.2d
+; CHECK-SD-NEXT:    stp d9, d8, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    .cfi_offset b8, -8
+; CHECK-SD-NEXT:    .cfi_offset b9, -16
+; CHECK-SD-NEXT:    movi v16.2d, #0x000000ffffffff
+; CHECK-SD-NEXT:    ushr v17.2d, v0.2d, #32
+; CHECK-SD-NEXT:    ushr v19.2d, v2.2d, #32
+; CHECK-SD-NEXT:    ushr v18.2d, v1.2d, #32
+; CHECK-SD-NEXT:    ushr v21.2d, v3.2d, #32
+; CHECK-SD-NEXT:    ushr v28.2d, v4.2d, #32
+; CHECK-SD-NEXT:    ushr v31.2d, v6.2d, #32
+; CHECK-SD-NEXT:    mov x8, v17.d[1]
+; CHECK-SD-NEXT:    fmov x13, d17
+; CHECK-SD-NEXT:    mov x10, v19.d[1]
+; CHECK-SD-NEXT:    and v0.16b, v0.16b, v16.16b
+; CHECK-SD-NEXT:    and v20.16b, v1.16b, v16.16b
+; CHECK-SD-NEXT:    and v1.16b, v2.16b, v16.16b
+; CHECK-SD-NEXT:    mov x9, v18.d[1]
+; CHECK-SD-NEXT:    fmov x14, d18
+; CHECK-SD-NEXT:    and v23.16b, v3.16b, v16.16b
+; CHECK-SD-NEXT:    scvtf s2, x13
+; CHECK-SD-NEXT:    and v24.16b, v4.16b, v16.16b
+; CHECK-SD-NEXT:    and v29.16b, v5.16b, v16.16b
+; CHECK-SD-NEXT:    mov x11, v0.d[1]
+; CHECK-SD-NEXT:    fmov x13, d0
+; CHECK-SD-NEXT:    scvtf s17, x8
+; CHECK-SD-NEXT:    fmov x8, d19
+; CHECK-SD-NEXT:    scvtf s25, x10
+; CHECK-SD-NEXT:    fmov x10, d20
+; CHECK-SD-NEXT:    mov x12, v1.d[1]
+; CHECK-SD-NEXT:    scvtf s22, x9
+; CHECK-SD-NEXT:    fmov x9, d23
+; CHECK-SD-NEXT:    scvtf s0, x13
+; CHECK-SD-NEXT:    scvtf s26, x14
+; CHECK-SD-NEXT:    and v30.16b, v6.16b, v16.16b
+; CHECK-SD-NEXT:    scvtf s18, x11
+; CHECK-SD-NEXT:    fmov x11, d1
+; CHECK-SD-NEXT:    scvtf s3, x8
+; CHECK-SD-NEXT:    scvtf s27, x10
+; CHECK-SD-NEXT:    mov x10, v24.d[1]
+; CHECK-SD-NEXT:    mov x8, v23.d[1]
+; CHECK-SD-NEXT:    mov v2.s[1], v17.s[0]
+; CHECK-SD-NEXT:    scvtf s23, x9
+; CHECK-SD-NEXT:    mov x9, v21.d[1]
+; CHECK-SD-NEXT:    scvtf s1, x11
+; CHECK-SD-NEXT:    fmov x11, d21
+; CHECK-SD-NEXT:    scvtf s19, x12
+; CHECK-SD-NEXT:    mov v0.s[1], v18.s[0]
+; CHECK-SD-NEXT:    ldp q18, q17, [sp, #112]
+; CHECK-SD-NEXT:    mov v3.s[1], v25.s[0]
+; CHECK-SD-NEXT:    mov x14, v20.d[1]
+; CHECK-SD-NEXT:    fmov x13, d28
+; CHECK-SD-NEXT:    scvtf s25, x11
+; CHECK-SD-NEXT:    fmov x11, d24
+; CHECK-SD-NEXT:    mov v2.s[2], v26.s[0]
+; CHECK-SD-NEXT:    scvtf s26, x8
+; CHECK-SD-NEXT:    mov x8, v29.d[1]
+; CHECK-SD-NEXT:    fmov x12, d29
+; CHECK-SD-NEXT:    mov v0.s[2], v27.s[0]
+; CHECK-SD-NEXT:    scvtf s27, x10
+; CHECK-SD-NEXT:    mov x10, v28.d[1]
+; CHECK-SD-NEXT:    scvtf s4, x11
+; CHECK-SD-NEXT:    ushr v29.2d, v5.2d, #32
+; CHECK-SD-NEXT:    mov v1.s[1], v19.s[0]
+; CHECK-SD-NEXT:    ldp q21, q19, [sp, #80]
+; CHECK-SD-NEXT:    scvtf s6, x13
+; CHECK-SD-NEXT:    scvtf s20, x14
+; CHECK-SD-NEXT:    mov x14, v31.d[1]
+; CHECK-SD-NEXT:    scvtf s9, x8
+; CHECK-SD-NEXT:    scvtf s28, x10
+; CHECK-SD-NEXT:    fmov x8, d29
+; CHECK-SD-NEXT:    mov x11, v30.d[1]
+; CHECK-SD-NEXT:    mov v4.s[1], v27.s[0]
+; CHECK-SD-NEXT:    scvtf s27, x9
+; CHECK-SD-NEXT:    fmov x9, d30
+; CHECK-SD-NEXT:    mov v1.s[2], v23.s[0]
+; CHECK-SD-NEXT:    ldp q24, q23, [sp, #16]
+; CHECK-SD-NEXT:    mov v0.s[3], v20.s[0]
+; CHECK-SD-NEXT:    scvtf s8, x14
+; CHECK-SD-NEXT:    mov v3.s[2], v25.s[0]
+; CHECK-SD-NEXT:    scvtf s5, x9
+; CHECK-SD-NEXT:    fmov x9, d31
+; CHECK-SD-NEXT:    and v31.16b, v7.16b, v16.16b
+; CHECK-SD-NEXT:    mov v6.s[1], v28.s[0]
+; CHECK-SD-NEXT:    ushr v7.2d, v7.2d, #32
+; CHECK-SD-NEXT:    scvtf s28, x8
+; CHECK-SD-NEXT:    scvtf s30, x11
+; CHECK-SD-NEXT:    scvtf s25, x12
+; CHECK-SD-NEXT:    mov v1.s[3], v26.s[0]
+; CHECK-SD-NEXT:    fmov x8, d31
+; CHECK-SD-NEXT:    scvtf s20, x9
+; CHECK-SD-NEXT:    mov x9, v29.d[1]
+; CHECK-SD-NEXT:    fmov x11, d7
+; CHECK-SD-NEXT:    mov x10, v31.d[1]
+; CHECK-SD-NEXT:    mov v3.s[3], v27.s[0]
+; CHECK-SD-NEXT:    mov v6.s[2], v28.s[0]
+; CHECK-SD-NEXT:    mov v2.s[3], v22.s[0]
+; CHECK-SD-NEXT:    scvtf s29, x8
+; CHECK-SD-NEXT:    mov x8, v7.d[1]
+; CHECK-SD-NEXT:    ushr v7.2d, v24.2d, #32
+; CHECK-SD-NEXT:    and v24.16b, v24.16b, v16.16b
+; CHECK-SD-NEXT:    mov v5.s[1], v30.s[0]
+; CHECK-SD-NEXT:    scvtf s30, x11
+; CHECK-SD-NEXT:    mov v20.s[1], v8.s[0]
+; CHECK-SD-NEXT:    scvtf s26, x9
+; CHECK-SD-NEXT:    mov v4.s[2], v25.s[0]
+; CHECK-SD-NEXT:    ldp q31, q25, [sp, #48]
+; CHECK-SD-NEXT:    mov x11, v7.d[1]
+; CHECK-SD-NEXT:    mov x9, v24.d[1]
+; CHECK-SD-NEXT:    fmov x12, d7
+; CHECK-SD-NEXT:    scvtf s27, x10
+; CHECK-SD-NEXT:    fmov x10, d24
+; CHECK-SD-NEXT:    ushr v24.2d, v23.2d, #32
+; CHECK-SD-NEXT:    mov v5.s[2], v29.s[0]
+; CHECK-SD-NEXT:    mov v20.s[2], v30.s[0]
+; CHECK-SD-NEXT:    ushr v29.2d, v31.2d, #32
+; CHECK-SD-NEXT:    scvtf s30, x8
+; CHECK-SD-NEXT:    scvtf s28, x11
+; CHECK-SD-NEXT:    scvtf s7, x12
+; CHECK-SD-NEXT:    and v31.16b, v31.16b, v16.16b
+; CHECK-SD-NEXT:    scvtf s8, x9
+; CHECK-SD-NEXT:    scvtf s22, x10
+; CHECK-SD-NEXT:    fmov x8, d24
+; CHECK-SD-NEXT:    mov x9, v24.d[1]
+; CHECK-SD-NEXT:    mov x10, v29.d[1]
+; CHECK-SD-NEXT:    mov v5.s[3], v27.s[0]
+; CHECK-SD-NEXT:    mov x11, v31.d[1]
+; CHECK-SD-NEXT:    fmov x12, d29
+; CHECK-SD-NEXT:    and v29.16b, v21.16b, v16.16b
+; CHECK-SD-NEXT:    mov v7.s[1], v28.s[0]
+; CHECK-SD-NEXT:    scvtf s27, x8
+; CHECK-SD-NEXT:    and v23.16b, v23.16b, v16.16b
+; CHECK-SD-NEXT:    mov v22.s[1], v8.s[0]
+; CHECK-SD-NEXT:    ushr v8.2d, v25.2d, #32
+; CHECK-SD-NEXT:    mov v20.s[3], v30.s[0]
+; CHECK-SD-NEXT:    scvtf s30, x10
+; CHECK-SD-NEXT:    scvtf s24, x12
+; CHECK-SD-NEXT:    mov x10, v29.d[1]
+; CHECK-SD-NEXT:    scvtf s28, x11
+; CHECK-SD-NEXT:    fmov x12, d29
+; CHECK-SD-NEXT:    ushr v29.2d, v18.2d, #32
+; CHECK-SD-NEXT:    mov v7.s[2], v27.s[0]
+; CHECK-SD-NEXT:    scvtf s27, x9
+; CHECK-SD-NEXT:    fmov x9, d31
+; CHECK-SD-NEXT:    fmov x11, d8
+; CHECK-SD-NEXT:    mov x8, v23.d[1]
+; CHECK-SD-NEXT:    fmov x13, d23
+; CHECK-SD-NEXT:    and v25.16b, v25.16b, v16.16b
+; CHECK-SD-NEXT:    mov v24.s[1], v30.s[0]
+; CHECK-SD-NEXT:    ushr v30.2d, v21.2d, #32
+; CHECK-SD-NEXT:    scvtf s23, x9
+; CHECK-SD-NEXT:    mov v6.s[3], v26.s[0]
+; CHECK-SD-NEXT:    scvtf s21, x12
+; CHECK-SD-NEXT:    scvtf s31, x11
+; CHECK-SD-NEXT:    mov x11, v29.d[1]
+; CHECK-SD-NEXT:    scvtf s26, x13
+; CHECK-SD-NEXT:    mov x12, v25.d[1]
+; CHECK-SD-NEXT:    fmov x13, d25
+; CHECK-SD-NEXT:    ushr v25.2d, v19.2d, #32
+; CHECK-SD-NEXT:    mov x9, v8.d[1]
+; CHECK-SD-NEXT:    scvtf s8, x10
+; CHECK-SD-NEXT:    mov x10, v30.d[1]
+; CHECK-SD-NEXT:    mov v23.s[1], v28.s[0]
+; CHECK-SD-NEXT:    fmov x14, d30
+; CHECK-SD-NEXT:    and v18.16b, v18.16b, v16.16b
+; CHECK-SD-NEXT:    scvtf s28, x11
+; CHECK-SD-NEXT:    fmov x11, d29
+; CHECK-SD-NEXT:    mov v22.s[2], v26.s[0]
+; CHECK-SD-NEXT:    mov v7.s[3], v27.s[0]
+; CHECK-SD-NEXT:    and v19.16b, v19.16b, v16.16b
+; CHECK-SD-NEXT:    and v16.16b, v17.16b, v16.16b
+; CHECK-SD-NEXT:    scvtf s26, x10
+; CHECK-SD-NEXT:    scvtf s27, x14
+; CHECK-SD-NEXT:    mov x14, v25.d[1]
+; CHECK-SD-NEXT:    scvtf s29, x11
+; CHECK-SD-NEXT:    fmov x11, d25
+; CHECK-SD-NEXT:    ushr v25.2d, v17.2d, #32
+; CHECK-SD-NEXT:    mov x10, v18.d[1]
+; CHECK-SD-NEXT:    scvtf s17, x13
+; CHECK-SD-NEXT:    fmov x13, d19
+; CHECK-SD-NEXT:    mov v24.s[2], v31.s[0]
+; CHECK-SD-NEXT:    mov v21.s[1], v8.s[0]
+; CHECK-SD-NEXT:    mov v4.s[3], v9.s[0]
+; CHECK-SD-NEXT:    scvtf s30, x11
+; CHECK-SD-NEXT:    fmov x11, d18
+; CHECK-SD-NEXT:    mov v27.s[1], v26.s[0]
+; CHECK-SD-NEXT:    mov v29.s[1], v28.s[0]
+; CHECK-SD-NEXT:    scvtf s28, x13
+; CHECK-SD-NEXT:    fmov x13, d16
+; CHECK-SD-NEXT:    scvtf s18, x10
+; CHECK-SD-NEXT:    mov x10, v25.d[1]
+; CHECK-SD-NEXT:    mov v23.s[2], v17.s[0]
+; CHECK-SD-NEXT:    scvtf s26, x11
+; CHECK-SD-NEXT:    fmov x11, d25
+; CHECK-SD-NEXT:    scvtf s17, x8
+; CHECK-SD-NEXT:    mov w8, #1333788672 // =0x4f800000
+; CHECK-SD-NEXT:    mov v27.s[2], v30.s[0]
+; CHECK-SD-NEXT:    mov v21.s[2], v28.s[0]
+; CHECK-SD-NEXT:    scvtf s28, x12
+; CHECK-SD-NEXT:    scvtf s25, x11
+; CHECK-SD-NEXT:    mov x11, v19.d[1]
+; CHECK-SD-NEXT:    scvtf s19, x9
+; CHECK-SD-NEXT:    mov v26.s[1], v18.s[0]
+; CHECK-SD-NEXT:    scvtf s18, x14
+; CHECK-SD-NEXT:    mov x9, v16.d[1]
+; CHECK-SD-NEXT:    scvtf s16, x13
+; CHECK-SD-NEXT:    mov v22.s[3], v17.s[0]
+; CHECK-SD-NEXT:    mov v23.s[3], v28.s[0]
+; CHECK-SD-NEXT:    mov v29.s[2], v25.s[0]
+; CHECK-SD-NEXT:    scvtf s25, x10
+; CHECK-SD-NEXT:    mov v24.s[3], v19.s[0]
+; CHECK-SD-NEXT:    mov v27.s[3], v18.s[0]
+; CHECK-SD-NEXT:    scvtf s18, x11
+; CHECK-SD-NEXT:    dup v19.4s, w8
+; CHECK-SD-NEXT:    mov v26.s[2], v16.s[0]
+; CHECK-SD-NEXT:    scvtf s16, x9
+; CHECK-SD-NEXT:    mov v29.s[3], v25.s[0]
+; CHECK-SD-NEXT:    fmul v2.4s, v2.4s, v19.4s
+; CHECK-SD-NEXT:    fmul v3.4s, v3.4s, v19.4s
+; CHECK-SD-NEXT:    fmul v6.4s, v6.4s, v19.4s
+; CHECK-SD-NEXT:    fmul v17.4s, v20.4s, v19.4s
+; CHECK-SD-NEXT:    fmul v7.4s, v7.4s, v19.4s
+; CHECK-SD-NEXT:    fmul v20.4s, v24.4s, v19.4s
+; CHECK-SD-NEXT:    mov v21.s[3], v18.s[0]
+; CHECK-SD-NEXT:    fmul v18.4s, v27.4s, v19.4s
+; CHECK-SD-NEXT:    mov v26.s[3], v16.s[0]
+; CHECK-SD-NEXT:    fmul v16.4s, v29.4s, v19.4s
+; CHECK-SD-NEXT:    fadd v0.4s, v2.4s, v0.4s
+; CHECK-SD-NEXT:    fadd v1.4s, v3.4s, v1.4s
+; CHECK-SD-NEXT:    fadd v2.4s, v6.4s, v4.4s
+; CHECK-SD-NEXT:    fadd v3.4s, v17.4s, v5.4s
+; CHECK-SD-NEXT:    fadd v4.4s, v7.4s, v22.4s
+; CHECK-SD-NEXT:    fadd v5.4s, v20.4s, v23.4s
+; CHECK-SD-NEXT:    fadd v6.4s, v18.4s, v21.4s
+; CHECK-SD-NEXT:    fadd v7.4s, v16.4s, v26.4s
+; CHECK-SD-NEXT:    ldp d9, d8, [sp], #16 // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: utofp_v32i64_v32f32:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
index 573fe3d8b8a77..ecc2e423c6e42 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
@@ -722,8 +722,23 @@ define <1 x float> @ucvtf_v1i64_v1f32(<1 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v1i64_v1f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
+; CHECK-NEXT:    ushr v2.2d, v0.2d, #32
+; CHECK-NEXT:    mov x8, v2.d[1]
+; CHECK-NEXT:    fmov x9, d2
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    scvtf s2, x9
+; CHECK-NEXT:    mov w9, #1333788672 // =0x4f800000
+; CHECK-NEXT:    scvtf s1, x8
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    dup v3.2s, w9
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    scvtf s0, x8
+; CHECK-NEXT:    mov v2.s[1], v1.s[0]
+; CHECK-NEXT:    scvtf s1, x9
+; CHECK-NEXT:    fmul v2.2s, v2.2s, v3.2s
+; CHECK-NEXT:    mov v1.s[1], v0.s[0]
+; CHECK-NEXT:    fadd v0.2s, v2.2s, v1.2s
 ; CHECK-NEXT:    ret
   %res = uitofp <1 x i64> %op1 to <1 x float>
   ret <1 x float> %res
@@ -733,8 +748,23 @@ define <1 x float> @ucvtf_v1i64_v1f32(<1 x i64> %op1) vscale_range(2,0) #0 {
 define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v2i64_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
+; CHECK-NEXT:    ushr v2.2d, v0.2d, #32
+; CHECK-NEXT:    mov x8, v2.d[1]
+; CHECK-NEXT:    fmov x9, d2
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    scvtf s2, x9
+; CHECK-NEXT:    mov w9, #1333788672 // =0x4f800000
+; CHECK-NEXT:    scvtf s1, x8
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    dup v3.2s, w9
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    scvtf s0, x8
+; CHECK-NEXT:    mov v2.s[1], v1.s[0]
+; CHECK-NEXT:    scvtf s1, x9
+; CHECK-NEXT:    fmul v2.2s, v2.2s, v3.2s
+; CHECK-NEXT:    mov v1.s[1], v0.s[0]
+; CHECK-NEXT:    fadd v0.2s, v2.2s, v1.2s
 ; CHECK-NEXT:    ret
   %res = uitofp <2 x i64> %op1 to <2 x float>
   ret <2 x float> %res
@@ -1646,8 +1676,11 @@ define <1 x float> @scvtf_v1i64_v1f32(<1 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v1i64_v1f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    movi d1, #0000000000000000
+; CHECK-NEXT:    scvtf s0, x8
+; CHECK-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
   %res = sitofp <1 x i64> %op1 to <1 x float>
   ret <1 x float> %res
@@ -1657,8 +1690,12 @@ define <1 x float> @scvtf_v1i64_v1f32(<1 x i64> %op1) vscale_range(2,0) #0 {
 define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v2i64_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    scvtf s0, x9
+; CHECK-NEXT:    scvtf s1, x8
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
   %res = sitofp <2 x i64> %op1 to <2 x float>
   ret <2 x float> %res
diff --git a/llvm/test/CodeGen/AArch64/vector-fcvt.ll b/llvm/test/CodeGen/AArch64/vector-fcvt.ll
index 8f38bdbedc629..610e9e90ed160 100644
--- a/llvm/test/CodeGen/AArch64/vector-fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/vector-fcvt.ll
@@ -87,14 +87,29 @@ define <8 x float> @sitofp_i32_float(<8 x i32> %a) {
 define <8 x float> @sitofp_i64_float(<8 x i64> %a) {
 ; CHECK-LABEL: sitofp_i64_float:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    scvtf v2.2d, v2.2d
-; CHECK-NEXT:    scvtf v4.2d, v1.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn v1.2s, v2.2d
-; CHECK-NEXT:    scvtf v2.2d, v3.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v4.2d
-; CHECK-NEXT:    fcvtn2 v1.4s, v2.2d
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    mov x9, v2.d[1]
+; CHECK-NEXT:    fmov x10, d0
+; CHECK-NEXT:    fmov x11, d2
+; CHECK-NEXT:    scvtf s0, x10
+; CHECK-NEXT:    mov x10, v3.d[1]
+; CHECK-NEXT:    scvtf s4, x8
+; CHECK-NEXT:    scvtf s5, x9
+; CHECK-NEXT:    scvtf s2, x11
+; CHECK-NEXT:    fmov x9, d1
+; CHECK-NEXT:    fmov x11, d3
+; CHECK-NEXT:    mov x8, v1.d[1]
+; CHECK-NEXT:    scvtf s1, x9
+; CHECK-NEXT:    mov v0.s[1], v4.s[0]
+; CHECK-NEXT:    scvtf s3, x11
+; CHECK-NEXT:    mov v2.s[1], v5.s[0]
+; CHECK-NEXT:    scvtf s4, x8
+; CHECK-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-NEXT:    scvtf s1, x10
+; CHECK-NEXT:    mov v2.s[2], v3.s[0]
+; CHECK-NEXT:    mov v0.s[3], v4.s[0]
+; CHECK-NEXT:    mov v2.s[3], v1.s[0]
+; CHECK-NEXT:    mov v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %1 = sitofp <8 x i64> %a to <8 x float>
   ret <8 x float> %1
@@ -177,14 +192,65 @@ define <8 x float> @uitofp_i32_float(<8 x i32> %a) {
 define <8 x float> @uitofp_i64_float(<8 x i64> %a) {
 ; CHECK-LABEL: uitofp_i64_float:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    ucvtf v2.2d, v2.2d
-; CHECK-NEXT:    ucvtf v4.2d, v1.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn v1.2s, v2.2d
-; CHECK-NEXT:    ucvtf v2.2d, v3.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v4.2d
-; CHECK-NEXT:    fcvtn2 v1.4s, v2.2d
+; CHECK-NEXT:    movi v4.2d, #0x000000ffffffff
+; CHECK-NEXT:    ushr v5.2d, v0.2d, #32
+; CHECK-NEXT:    ushr v6.2d, v2.2d, #32
+; CHECK-NEXT:    ushr v7.2d, v1.2d, #32
+; CHECK-NEXT:    ushr v16.2d, v3.2d, #32
+; CHECK-NEXT:    mov x8, v5.d[1]
+; CHECK-NEXT:    mov x9, v6.d[1]
+; CHECK-NEXT:    fmov x10, d5
+; CHECK-NEXT:    and v0.16b, v0.16b, v4.16b
+; CHECK-NEXT:    and v2.16b, v2.16b, v4.16b
+; CHECK-NEXT:    fmov x13, d6
+; CHECK-NEXT:    fmov x12, d7
+; CHECK-NEXT:    and v1.16b, v1.16b, v4.16b
+; CHECK-NEXT:    mov x11, v7.d[1]
+; CHECK-NEXT:    scvtf s5, x10
+; CHECK-NEXT:    and v3.16b, v3.16b, v4.16b
+; CHECK-NEXT:    mov x10, v0.d[1]
+; CHECK-NEXT:    scvtf s6, x8
+; CHECK-NEXT:    mov x8, v2.d[1]
+; CHECK-NEXT:    scvtf s4, x13
+; CHECK-NEXT:    scvtf s7, x9
+; CHECK-NEXT:    fmov x9, d16
+; CHECK-NEXT:    scvtf s17, x12
+; CHECK-NEXT:    fmov x12, d0
+; CHECK-NEXT:    fmov x13, d2
+; CHECK-NEXT:    scvtf s2, x10
+; CHECK-NEXT:    mov v5.s[1], v6.s[0]
+; CHECK-NEXT:    scvtf s6, x8
+; CHECK-NEXT:    scvtf s0, x12
+; CHECK-NEXT:    scvtf s18, x13
+; CHECK-NEXT:    mov x8, v16.d[1]
+; CHECK-NEXT:    mov v4.s[1], v7.s[0]
+; CHECK-NEXT:    scvtf s7, x9
+; CHECK-NEXT:    fmov x10, d1
+; CHECK-NEXT:    fmov x13, d3
+; CHECK-NEXT:    mov x9, v1.d[1]
+; CHECK-NEXT:    mov x12, v3.d[1]
+; CHECK-NEXT:    mov v5.s[2], v17.s[0]
+; CHECK-NEXT:    mov v0.s[1], v2.s[0]
+; CHECK-NEXT:    scvtf s1, x10
+; CHECK-NEXT:    mov v18.s[1], v6.s[0]
+; CHECK-NEXT:    scvtf s2, x11
+; CHECK-NEXT:    scvtf s3, x13
+; CHECK-NEXT:    mov v4.s[2], v7.s[0]
+; CHECK-NEXT:    scvtf s6, x8
+; CHECK-NEXT:    mov w8, #1333788672 // =0x4f800000
+; CHECK-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-NEXT:    scvtf s1, x9
+; CHECK-NEXT:    mov v5.s[3], v2.s[0]
+; CHECK-NEXT:    scvtf s2, x12
+; CHECK-NEXT:    mov v18.s[2], v3.s[0]
+; CHECK-NEXT:    mov v4.s[3], v6.s[0]
+; CHECK-NEXT:    dup v3.4s, w8
+; CHECK-NEXT:    mov v0.s[3], v1.s[0]
+; CHECK-NEXT:    fmul v1.4s, v5.4s, v3.4s
+; CHECK-NEXT:    mov v18.s[3], v2.s[0]
+; CHECK-NEXT:    fmul v2.4s, v4.4s, v3.4s
+; CHECK-NEXT:    fadd v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    fadd v1.4s, v2.4s, v18.4s
 ; CHECK-NEXT:    ret
   %1 = uitofp <8 x i64> %a to <8 x float>
   ret <8 x float> %1



More information about the llvm-commits mailing list