[llvm] [AArch64] Don't try to vectorize fixed point to fp narrowing conversion (PR #130665)

Pranav Kant via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 24 13:50:40 PDT 2025


https://github.com/pranavk updated https://github.com/llvm/llvm-project/pull/130665

>From 11f13142fbe99e175884687304bfbbeaf8495310 Mon Sep 17 00:00:00 2001
From: Pranav Kant <prka at google.com>
Date: Mon, 24 Mar 2025 17:27:32 +0000
Subject: [PATCH 1/2] init

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d0f51b73a4a44..1c8e3afdfd718 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5106,6 +5106,29 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
   uint64_t VTSize = VT.getFixedSizeInBits();
   uint64_t InVTSize = InVT.getFixedSizeInBits();
   if (VTSize < InVTSize) {
+    // AArch64 has no single vector instruction that converts a
+    // fixed-point integer to floating point and narrows the result.
+    // Rounding the wide intermediate down to f32/f64 introduces double
+    // rounding; narrowing to f16 is safe thanks to its small width.
+    bool IsTargetf32 = VT.getVectorElementType() == MVT::f32;
+    bool IsTargetf16 = false;
+    if (Op.hasOneUse() &&
+        Op->user_begin()->getOpcode() == ISD::CONCAT_VECTORS) {
+      // Some vector types are split in half during legalization, then
+      // concatenated and rounded back to the original vector type. If this
+      // ultimately resolves to an f16 type, double rounding is not a concern.
+      SDNode *U = *Op->user_begin();
+      if (U->hasOneUse() && U->user_begin()->getOpcode() == ISD::FP_ROUND) {
+        EVT TmpVT = U->user_begin()->getValueType(0);
+        if (TmpVT.getScalarType() == MVT::f16)
+          IsTargetf16 = true;
+      }
+    }
+
+    if (IsTargetf32 && !IsTargetf16) {
+      return !IsStrict ? DAG.UnrollVectorOp(Op.getNode()) : SDValue();
+    }
+
     MVT CastVT =
         MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
                          InVT.getVectorNumElements());

>From 04aafe43bbaeac48040d02dd24271aa3256133bd Mon Sep 17 00:00:00 2001
From: Pranav Kant <prka at google.com>
Date: Mon, 24 Mar 2025 17:51:51 +0000
Subject: [PATCH 2/2] modify tests

---
 .../aarch64-neon-vector-insert-uaddlv.ll      |  32 +-
 .../CodeGen/AArch64/arm64-convert-v4f64.ll    |  33 +-
 .../CodeGen/AArch64/bf16-v4-instructions.ll   | 100 +-
 .../CodeGen/AArch64/bf16-v8-instructions.ll   | 190 ++--
 .../test/CodeGen/AArch64/complex-int-to-fp.ll |  25 +-
 .../fold-int-pow2-with-fmul-or-fdiv.ll        |  11 +-
 .../CodeGen/AArch64/fp-intrinsics-vector.ll   |  51 +-
 llvm/test/CodeGen/AArch64/fprcvt-cvtf.ll      |  26 +-
 llvm/test/CodeGen/AArch64/itofp-bf16.ll       | 970 +++++++++++-------
 llvm/test/CodeGen/AArch64/itofp.ll            | 622 ++++++++---
 .../AArch64/sve-fixed-length-int-to-fp.ll     |  30 +-
 llvm/test/CodeGen/AArch64/vector-fcvt.ll      |  62 +-
 12 files changed, 1437 insertions(+), 715 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
index b357a24f892ff..91eda8d552397 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
@@ -148,9 +148,9 @@ define void @insert_vec_v6i64_uaddlv_from_v4i32(ptr %0) {
 ; CHECK-NEXT:    movi.2d v0, #0000000000000000
 ; CHECK-NEXT:    str xzr, [x0, #16]
 ; CHECK-NEXT:    uaddlv.4s d1, v0
-; CHECK-NEXT:    mov.d v0[0], v1[0]
-; CHECK-NEXT:    ucvtf.2d v0, v0
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    ucvtf s1, x8
+; CHECK-NEXT:    mov.s v0[0], v1[0]
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
 
@@ -166,10 +166,11 @@ define void @insert_vec_v2i64_uaddlv_from_v4i32(ptr %0) {
 ; CHECK-LABEL: insert_vec_v2i64_uaddlv_from_v4i32:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    movi.2d v0, #0000000000000000
-; CHECK-NEXT:    uaddlv.4s d1, v0
-; CHECK-NEXT:    mov.d v0[0], v1[0]
-; CHECK-NEXT:    ucvtf.2d v0, v0
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    uaddlv.4s d0, v0
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    movi d0, #0000000000000000
+; CHECK-NEXT:    ucvtf s1, x8
+; CHECK-NEXT:    mov.s v0[0], v1[0]
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
 
@@ -187,9 +188,9 @@ define void @insert_vec_v5i64_uaddlv_from_v4i32(ptr %0) {
 ; CHECK-NEXT:    movi.2d v0, #0000000000000000
 ; CHECK-NEXT:    str wzr, [x0, #16]
 ; CHECK-NEXT:    uaddlv.4s d1, v0
-; CHECK-NEXT:    mov.d v0[0], v1[0]
-; CHECK-NEXT:    ucvtf.2d v0, v0
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    ucvtf s1, x8
+; CHECK-NEXT:    mov.s v0[0], v1[0]
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
 
@@ -254,9 +255,14 @@ define void @insert_vec_v16i64_uaddlv_from_v4i16(ptr %0) {
 ; CHECK-NEXT:    uaddlv.4h s1, v0
 ; CHECK-NEXT:    stp q0, q0, [x0, #32]
 ; CHECK-NEXT:    mov.s v2[0], v1[0]
-; CHECK-NEXT:    ucvtf.2d v1, v2
-; CHECK-NEXT:    fcvtn v1.2s, v1.2d
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    fmov x8, d2
+; CHECK-NEXT:    mov.d x9, v2[1]
+; CHECK-NEXT:    movi.2d v2, #0000000000000000
+; CHECK-NEXT:    ucvtf s1, x8
+; CHECK-NEXT:    ucvtf s3, x9
+; CHECK-NEXT:    mov.s v2[0], v1[0]
+; CHECK-NEXT:    mov.s v2[1], v3[0]
+; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
 
 entry:
diff --git a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
index 508f68d6f14d4..2b9e334cc7812 100644
--- a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
@@ -53,20 +53,27 @@ define <4 x half> @uitofp_v4i64_to_v4f16(ptr %ptr) {
 define <4 x bfloat> @uitofp_v4i64_to_v4bf16(ptr %ptr) {
 ; CHECK-LABEL: uitofp_v4i64_to_v4bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q0, q2, [x0]
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    ucvtf s1, x9
+; CHECK-NEXT:    mov x9, v2.d[1]
+; CHECK-NEXT:    ucvtf s0, x8
+; CHECK-NEXT:    fmov x8, d2
+; CHECK-NEXT:    ucvtf s2, x8
+; CHECK-NEXT:    mov v1.s[1], v0.s[0]
+; CHECK-NEXT:    ucvtf s0, x9
+; CHECK-NEXT:    mov v1.s[2], v2.s[0]
 ; CHECK-NEXT:    movi v2.4s, #127, msl #8
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    ucvtf v1.2d, v1.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT:    movi v1.4s, #1
-; CHECK-NEXT:    ushr v3.4s, v0.4s, #16
-; CHECK-NEXT:    add v2.4s, v0.4s, v2.4s
-; CHECK-NEXT:    and v1.16b, v3.16b, v1.16b
-; CHECK-NEXT:    fcmeq v3.4s, v0.4s, v0.4s
-; CHECK-NEXT:    orr v0.4s, #64, lsl #16
-; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    bit v0.16b, v1.16b, v3.16b
+; CHECK-NEXT:    mov v1.s[3], v0.s[0]
+; CHECK-NEXT:    movi v0.4s, #1
+; CHECK-NEXT:    ushr v3.4s, v1.4s, #16
+; CHECK-NEXT:    add v2.4s, v1.4s, v2.4s
+; CHECK-NEXT:    and v0.16b, v3.16b, v0.16b
+; CHECK-NEXT:    fcmeq v3.4s, v1.4s, v1.4s
+; CHECK-NEXT:    orr v1.4s, #64, lsl #16
+; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    bif v0.16b, v1.16b, v3.16b
 ; CHECK-NEXT:    shrn v0.4h, v0.4s, #16
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i64>, ptr %ptr
diff --git a/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll
index 1cd0294b0083e..e185da3093645 100644
--- a/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll
@@ -310,29 +310,43 @@ define <4 x bfloat> @sitofp_i32(<4 x i32> %a) #0 {
 define <4 x bfloat> @sitofp_i64(<4 x i64> %a) #0 {
 ; CHECK-CVT-LABEL: sitofp_i64:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-CVT-NEXT:    scvtf v1.2d, v1.2d
-; CHECK-CVT-NEXT:    movi v2.4s, #127, msl #8
-; CHECK-CVT-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-CVT-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-CVT-NEXT:    movi v1.4s, #1
-; CHECK-CVT-NEXT:    ushr v3.4s, v0.4s, #16
-; CHECK-CVT-NEXT:    add v2.4s, v0.4s, v2.4s
-; CHECK-CVT-NEXT:    and v1.16b, v3.16b, v1.16b
-; CHECK-CVT-NEXT:    fcmeq v3.4s, v0.4s, v0.4s
-; CHECK-CVT-NEXT:    orr v0.4s, #64, lsl #16
-; CHECK-CVT-NEXT:    add v1.4s, v1.4s, v2.4s
-; CHECK-CVT-NEXT:    bit v0.16b, v1.16b, v3.16b
+; CHECK-CVT-NEXT:    mov x8, v0.d[1]
+; CHECK-CVT-NEXT:    fmov x9, d0
+; CHECK-CVT-NEXT:    scvtf s2, x9
+; CHECK-CVT-NEXT:    mov x9, v1.d[1]
+; CHECK-CVT-NEXT:    scvtf s0, x8
+; CHECK-CVT-NEXT:    fmov x8, d1
+; CHECK-CVT-NEXT:    scvtf s1, x8
+; CHECK-CVT-NEXT:    mov v2.s[1], v0.s[0]
+; CHECK-CVT-NEXT:    scvtf s0, x9
+; CHECK-CVT-NEXT:    mov v2.s[2], v1.s[0]
+; CHECK-CVT-NEXT:    movi v1.4s, #127, msl #8
+; CHECK-CVT-NEXT:    mov v2.s[3], v0.s[0]
+; CHECK-CVT-NEXT:    movi v0.4s, #1
+; CHECK-CVT-NEXT:    ushr v3.4s, v2.4s, #16
+; CHECK-CVT-NEXT:    add v1.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT:    and v0.16b, v3.16b, v0.16b
+; CHECK-CVT-NEXT:    fcmeq v3.4s, v2.4s, v2.4s
+; CHECK-CVT-NEXT:    orr v2.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    bif v0.16b, v2.16b, v3.16b
 ; CHECK-CVT-NEXT:    shrn v0.4h, v0.4s, #16
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-BF16-LABEL: sitofp_i64:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-BF16-NEXT:    scvtf v1.2d, v1.2d
-; CHECK-BF16-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-BF16-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    mov x8, v0.d[1]
+; CHECK-BF16-NEXT:    fmov x9, d0
+; CHECK-BF16-NEXT:    scvtf s2, x9
+; CHECK-BF16-NEXT:    mov x9, v1.d[1]
+; CHECK-BF16-NEXT:    scvtf s0, x8
+; CHECK-BF16-NEXT:    fmov x8, d1
+; CHECK-BF16-NEXT:    mov v2.s[1], v0.s[0]
+; CHECK-BF16-NEXT:    scvtf s0, x8
+; CHECK-BF16-NEXT:    mov v2.s[2], v0.s[0]
+; CHECK-BF16-NEXT:    scvtf s0, x9
+; CHECK-BF16-NEXT:    mov v2.s[3], v0.s[0]
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v2.4s
 ; CHECK-BF16-NEXT:    ret
   %1 = sitofp <4 x i64> %a to <4 x bfloat>
   ret <4 x bfloat> %1
@@ -413,29 +427,43 @@ define <4 x bfloat> @uitofp_i32(<4 x i32> %a) #0 {
 define <4 x bfloat> @uitofp_i64(<4 x i64> %a) #0 {
 ; CHECK-CVT-LABEL: uitofp_i64:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-CVT-NEXT:    ucvtf v1.2d, v1.2d
-; CHECK-CVT-NEXT:    movi v2.4s, #127, msl #8
-; CHECK-CVT-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-CVT-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-CVT-NEXT:    movi v1.4s, #1
-; CHECK-CVT-NEXT:    ushr v3.4s, v0.4s, #16
-; CHECK-CVT-NEXT:    add v2.4s, v0.4s, v2.4s
-; CHECK-CVT-NEXT:    and v1.16b, v3.16b, v1.16b
-; CHECK-CVT-NEXT:    fcmeq v3.4s, v0.4s, v0.4s
-; CHECK-CVT-NEXT:    orr v0.4s, #64, lsl #16
-; CHECK-CVT-NEXT:    add v1.4s, v1.4s, v2.4s
-; CHECK-CVT-NEXT:    bit v0.16b, v1.16b, v3.16b
+; CHECK-CVT-NEXT:    mov x8, v0.d[1]
+; CHECK-CVT-NEXT:    fmov x9, d0
+; CHECK-CVT-NEXT:    ucvtf s2, x9
+; CHECK-CVT-NEXT:    mov x9, v1.d[1]
+; CHECK-CVT-NEXT:    ucvtf s0, x8
+; CHECK-CVT-NEXT:    fmov x8, d1
+; CHECK-CVT-NEXT:    ucvtf s1, x8
+; CHECK-CVT-NEXT:    mov v2.s[1], v0.s[0]
+; CHECK-CVT-NEXT:    ucvtf s0, x9
+; CHECK-CVT-NEXT:    mov v2.s[2], v1.s[0]
+; CHECK-CVT-NEXT:    movi v1.4s, #127, msl #8
+; CHECK-CVT-NEXT:    mov v2.s[3], v0.s[0]
+; CHECK-CVT-NEXT:    movi v0.4s, #1
+; CHECK-CVT-NEXT:    ushr v3.4s, v2.4s, #16
+; CHECK-CVT-NEXT:    add v1.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT:    and v0.16b, v3.16b, v0.16b
+; CHECK-CVT-NEXT:    fcmeq v3.4s, v2.4s, v2.4s
+; CHECK-CVT-NEXT:    orr v2.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    bif v0.16b, v2.16b, v3.16b
 ; CHECK-CVT-NEXT:    shrn v0.4h, v0.4s, #16
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-BF16-LABEL: uitofp_i64:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-BF16-NEXT:    ucvtf v1.2d, v1.2d
-; CHECK-BF16-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-BF16-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT:    mov x8, v0.d[1]
+; CHECK-BF16-NEXT:    fmov x9, d0
+; CHECK-BF16-NEXT:    ucvtf s2, x9
+; CHECK-BF16-NEXT:    mov x9, v1.d[1]
+; CHECK-BF16-NEXT:    ucvtf s0, x8
+; CHECK-BF16-NEXT:    fmov x8, d1
+; CHECK-BF16-NEXT:    mov v2.s[1], v0.s[0]
+; CHECK-BF16-NEXT:    ucvtf s0, x8
+; CHECK-BF16-NEXT:    mov v2.s[2], v0.s[0]
+; CHECK-BF16-NEXT:    ucvtf s0, x9
+; CHECK-BF16-NEXT:    mov v2.s[3], v0.s[0]
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v2.4s
 ; CHECK-BF16-NEXT:    ret
   %1 = uitofp <4 x i64> %a to <4 x bfloat>
   ret <4 x bfloat> %1
diff --git a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
index 2eaa58de92807..3a55b68f2d1a3 100644
--- a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
@@ -489,45 +489,74 @@ define <8 x bfloat> @sitofp_i32(<8 x i32> %a) #0 {
 define <8 x bfloat> @sitofp_i64(<8 x i64> %a) #0 {
 ; CHECK-CVT-LABEL: sitofp_i64:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    scvtf v2.2d, v2.2d
-; CHECK-CVT-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-CVT-NEXT:    scvtf v3.2d, v3.2d
-; CHECK-CVT-NEXT:    scvtf v1.2d, v1.2d
-; CHECK-CVT-NEXT:    fcvtn v2.2s, v2.2d
-; CHECK-CVT-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-CVT-NEXT:    fcvtn2 v2.4s, v3.2d
-; CHECK-CVT-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-CVT-NEXT:    movi v1.4s, #1
-; CHECK-CVT-NEXT:    movi v3.4s, #127, msl #8
-; CHECK-CVT-NEXT:    ushr v4.4s, v2.4s, #16
-; CHECK-CVT-NEXT:    ushr v5.4s, v0.4s, #16
-; CHECK-CVT-NEXT:    add v6.4s, v2.4s, v3.4s
-; CHECK-CVT-NEXT:    add v3.4s, v0.4s, v3.4s
-; CHECK-CVT-NEXT:    and v4.16b, v4.16b, v1.16b
-; CHECK-CVT-NEXT:    and v1.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT:    fmov x10, d2
+; CHECK-CVT-NEXT:    mov x8, v2.d[1]
+; CHECK-CVT-NEXT:    mov x9, v0.d[1]
+; CHECK-CVT-NEXT:    scvtf s2, x10
+; CHECK-CVT-NEXT:    fmov x10, d0
+; CHECK-CVT-NEXT:    scvtf s0, x8
+; CHECK-CVT-NEXT:    scvtf s5, x9
+; CHECK-CVT-NEXT:    fmov x9, d3
+; CHECK-CVT-NEXT:    mov x8, v3.d[1]
+; CHECK-CVT-NEXT:    scvtf s4, x10
+; CHECK-CVT-NEXT:    fmov x10, d1
+; CHECK-CVT-NEXT:    scvtf s3, x9
+; CHECK-CVT-NEXT:    mov x9, v1.d[1]
+; CHECK-CVT-NEXT:    mov v2.s[1], v0.s[0]
+; CHECK-CVT-NEXT:    scvtf s0, x10
+; CHECK-CVT-NEXT:    scvtf s1, x8
+; CHECK-CVT-NEXT:    mov v4.s[1], v5.s[0]
+; CHECK-CVT-NEXT:    mov v2.s[2], v3.s[0]
+; CHECK-CVT-NEXT:    scvtf s3, x9
+; CHECK-CVT-NEXT:    mov v4.s[2], v0.s[0]
+; CHECK-CVT-NEXT:    movi v0.4s, #1
+; CHECK-CVT-NEXT:    mov v2.s[3], v1.s[0]
+; CHECK-CVT-NEXT:    movi v1.4s, #127, msl #8
+; CHECK-CVT-NEXT:    mov v4.s[3], v3.s[0]
+; CHECK-CVT-NEXT:    ushr v3.4s, v2.4s, #16
+; CHECK-CVT-NEXT:    add v6.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT:    ushr v5.4s, v4.4s, #16
+; CHECK-CVT-NEXT:    add v1.4s, v4.4s, v1.4s
+; CHECK-CVT-NEXT:    and v3.16b, v3.16b, v0.16b
+; CHECK-CVT-NEXT:    and v0.16b, v5.16b, v0.16b
 ; CHECK-CVT-NEXT:    fcmeq v5.4s, v2.4s, v2.4s
 ; CHECK-CVT-NEXT:    orr v2.4s, #64, lsl #16
-; CHECK-CVT-NEXT:    add v4.4s, v4.4s, v6.4s
-; CHECK-CVT-NEXT:    fcmeq v6.4s, v0.4s, v0.4s
-; CHECK-CVT-NEXT:    add v1.4s, v1.4s, v3.4s
-; CHECK-CVT-NEXT:    orr v0.4s, #64, lsl #16
-; CHECK-CVT-NEXT:    bit v2.16b, v4.16b, v5.16b
-; CHECK-CVT-NEXT:    bit v0.16b, v1.16b, v6.16b
-; CHECK-CVT-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT:    add v3.4s, v3.4s, v6.4s
+; CHECK-CVT-NEXT:    fcmeq v6.4s, v4.4s, v4.4s
+; CHECK-CVT-NEXT:    orr v4.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    mov v1.16b, v5.16b
+; CHECK-CVT-NEXT:    bif v0.16b, v4.16b, v6.16b
+; CHECK-CVT-NEXT:    bsl v1.16b, v3.16b, v2.16b
+; CHECK-CVT-NEXT:    uzp2 v0.8h, v0.8h, v1.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-BF16-LABEL: sitofp_i64:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-BF16-NEXT:    scvtf v2.2d, v2.2d
-; CHECK-BF16-NEXT:    scvtf v1.2d, v1.2d
-; CHECK-BF16-NEXT:    scvtf v3.2d, v3.2d
-; CHECK-BF16-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-BF16-NEXT:    fcvtn v2.2s, v2.2d
-; CHECK-BF16-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-BF16-NEXT:    fcvtn2 v2.4s, v3.2d
-; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
-; CHECK-BF16-NEXT:    bfcvtn2 v0.8h, v2.4s
+; CHECK-BF16-NEXT:    mov x9, v0.d[1]
+; CHECK-BF16-NEXT:    fmov x10, d0
+; CHECK-BF16-NEXT:    mov x8, v2.d[1]
+; CHECK-BF16-NEXT:    scvtf s4, x10
+; CHECK-BF16-NEXT:    fmov x10, d1
+; CHECK-BF16-NEXT:    scvtf s0, x9
+; CHECK-BF16-NEXT:    fmov x9, d2
+; CHECK-BF16-NEXT:    scvtf s2, x8
+; CHECK-BF16-NEXT:    mov x8, v1.d[1]
+; CHECK-BF16-NEXT:    scvtf s1, x9
+; CHECK-BF16-NEXT:    fmov x9, d3
+; CHECK-BF16-NEXT:    mov v4.s[1], v0.s[0]
+; CHECK-BF16-NEXT:    scvtf s0, x10
+; CHECK-BF16-NEXT:    mov x10, v3.d[1]
+; CHECK-BF16-NEXT:    scvtf s3, x9
+; CHECK-BF16-NEXT:    mov v1.s[1], v2.s[0]
+; CHECK-BF16-NEXT:    scvtf s2, x8
+; CHECK-BF16-NEXT:    mov v4.s[2], v0.s[0]
+; CHECK-BF16-NEXT:    scvtf s0, x10
+; CHECK-BF16-NEXT:    mov v1.s[2], v3.s[0]
+; CHECK-BF16-NEXT:    mov v4.s[3], v2.s[0]
+; CHECK-BF16-NEXT:    mov v1.s[3], v0.s[0]
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v4.4s
+; CHECK-BF16-NEXT:    bfcvtn2 v0.8h, v1.4s
 ; CHECK-BF16-NEXT:    ret
   %1 = sitofp <8 x i64> %a to <8 x bfloat>
   ret <8 x bfloat> %1
@@ -712,45 +741,74 @@ define <8 x bfloat> @uitofp_i32(<8 x i32> %a) #0 {
 define <8 x bfloat> @uitofp_i64(<8 x i64> %a) #0 {
 ; CHECK-CVT-LABEL: uitofp_i64:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    ucvtf v2.2d, v2.2d
-; CHECK-CVT-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-CVT-NEXT:    ucvtf v3.2d, v3.2d
-; CHECK-CVT-NEXT:    ucvtf v1.2d, v1.2d
-; CHECK-CVT-NEXT:    fcvtn v2.2s, v2.2d
-; CHECK-CVT-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-CVT-NEXT:    fcvtn2 v2.4s, v3.2d
-; CHECK-CVT-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-CVT-NEXT:    movi v1.4s, #1
-; CHECK-CVT-NEXT:    movi v3.4s, #127, msl #8
-; CHECK-CVT-NEXT:    ushr v4.4s, v2.4s, #16
-; CHECK-CVT-NEXT:    ushr v5.4s, v0.4s, #16
-; CHECK-CVT-NEXT:    add v6.4s, v2.4s, v3.4s
-; CHECK-CVT-NEXT:    add v3.4s, v0.4s, v3.4s
-; CHECK-CVT-NEXT:    and v4.16b, v4.16b, v1.16b
-; CHECK-CVT-NEXT:    and v1.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT:    fmov x10, d2
+; CHECK-CVT-NEXT:    mov x8, v2.d[1]
+; CHECK-CVT-NEXT:    mov x9, v0.d[1]
+; CHECK-CVT-NEXT:    ucvtf s2, x10
+; CHECK-CVT-NEXT:    fmov x10, d0
+; CHECK-CVT-NEXT:    ucvtf s0, x8
+; CHECK-CVT-NEXT:    ucvtf s5, x9
+; CHECK-CVT-NEXT:    fmov x9, d3
+; CHECK-CVT-NEXT:    mov x8, v3.d[1]
+; CHECK-CVT-NEXT:    ucvtf s4, x10
+; CHECK-CVT-NEXT:    fmov x10, d1
+; CHECK-CVT-NEXT:    ucvtf s3, x9
+; CHECK-CVT-NEXT:    mov x9, v1.d[1]
+; CHECK-CVT-NEXT:    mov v2.s[1], v0.s[0]
+; CHECK-CVT-NEXT:    ucvtf s0, x10
+; CHECK-CVT-NEXT:    ucvtf s1, x8
+; CHECK-CVT-NEXT:    mov v4.s[1], v5.s[0]
+; CHECK-CVT-NEXT:    mov v2.s[2], v3.s[0]
+; CHECK-CVT-NEXT:    ucvtf s3, x9
+; CHECK-CVT-NEXT:    mov v4.s[2], v0.s[0]
+; CHECK-CVT-NEXT:    movi v0.4s, #1
+; CHECK-CVT-NEXT:    mov v2.s[3], v1.s[0]
+; CHECK-CVT-NEXT:    movi v1.4s, #127, msl #8
+; CHECK-CVT-NEXT:    mov v4.s[3], v3.s[0]
+; CHECK-CVT-NEXT:    ushr v3.4s, v2.4s, #16
+; CHECK-CVT-NEXT:    add v6.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT:    ushr v5.4s, v4.4s, #16
+; CHECK-CVT-NEXT:    add v1.4s, v4.4s, v1.4s
+; CHECK-CVT-NEXT:    and v3.16b, v3.16b, v0.16b
+; CHECK-CVT-NEXT:    and v0.16b, v5.16b, v0.16b
 ; CHECK-CVT-NEXT:    fcmeq v5.4s, v2.4s, v2.4s
 ; CHECK-CVT-NEXT:    orr v2.4s, #64, lsl #16
-; CHECK-CVT-NEXT:    add v4.4s, v4.4s, v6.4s
-; CHECK-CVT-NEXT:    fcmeq v6.4s, v0.4s, v0.4s
-; CHECK-CVT-NEXT:    add v1.4s, v1.4s, v3.4s
-; CHECK-CVT-NEXT:    orr v0.4s, #64, lsl #16
-; CHECK-CVT-NEXT:    bit v2.16b, v4.16b, v5.16b
-; CHECK-CVT-NEXT:    bit v0.16b, v1.16b, v6.16b
-; CHECK-CVT-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT:    add v3.4s, v3.4s, v6.4s
+; CHECK-CVT-NEXT:    fcmeq v6.4s, v4.4s, v4.4s
+; CHECK-CVT-NEXT:    orr v4.4s, #64, lsl #16
+; CHECK-CVT-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    mov v1.16b, v5.16b
+; CHECK-CVT-NEXT:    bif v0.16b, v4.16b, v6.16b
+; CHECK-CVT-NEXT:    bsl v1.16b, v3.16b, v2.16b
+; CHECK-CVT-NEXT:    uzp2 v0.8h, v0.8h, v1.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-BF16-LABEL: uitofp_i64:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-BF16-NEXT:    ucvtf v2.2d, v2.2d
-; CHECK-BF16-NEXT:    ucvtf v1.2d, v1.2d
-; CHECK-BF16-NEXT:    ucvtf v3.2d, v3.2d
-; CHECK-BF16-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-BF16-NEXT:    fcvtn v2.2s, v2.2d
-; CHECK-BF16-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-BF16-NEXT:    fcvtn2 v2.4s, v3.2d
-; CHECK-BF16-NEXT:    bfcvtn v0.4h, v0.4s
-; CHECK-BF16-NEXT:    bfcvtn2 v0.8h, v2.4s
+; CHECK-BF16-NEXT:    mov x9, v0.d[1]
+; CHECK-BF16-NEXT:    fmov x10, d0
+; CHECK-BF16-NEXT:    mov x8, v2.d[1]
+; CHECK-BF16-NEXT:    ucvtf s4, x10
+; CHECK-BF16-NEXT:    fmov x10, d1
+; CHECK-BF16-NEXT:    ucvtf s0, x9
+; CHECK-BF16-NEXT:    fmov x9, d2
+; CHECK-BF16-NEXT:    ucvtf s2, x8
+; CHECK-BF16-NEXT:    mov x8, v1.d[1]
+; CHECK-BF16-NEXT:    ucvtf s1, x9
+; CHECK-BF16-NEXT:    fmov x9, d3
+; CHECK-BF16-NEXT:    mov v4.s[1], v0.s[0]
+; CHECK-BF16-NEXT:    ucvtf s0, x10
+; CHECK-BF16-NEXT:    mov x10, v3.d[1]
+; CHECK-BF16-NEXT:    ucvtf s3, x9
+; CHECK-BF16-NEXT:    mov v1.s[1], v2.s[0]
+; CHECK-BF16-NEXT:    ucvtf s2, x8
+; CHECK-BF16-NEXT:    mov v4.s[2], v0.s[0]
+; CHECK-BF16-NEXT:    ucvtf s0, x10
+; CHECK-BF16-NEXT:    mov v1.s[2], v3.s[0]
+; CHECK-BF16-NEXT:    mov v4.s[3], v2.s[0]
+; CHECK-BF16-NEXT:    mov v1.s[3], v0.s[0]
+; CHECK-BF16-NEXT:    bfcvtn v0.4h, v4.4s
+; CHECK-BF16-NEXT:    bfcvtn2 v0.8h, v1.4s
 ; CHECK-BF16-NEXT:    ret
   %1 = uitofp <8 x i64> %a to <8 x bfloat>
   ret <8 x bfloat> %1
diff --git a/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll b/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll
index ec504b4782547..baca159f9dd55 100644
--- a/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll
@@ -5,9 +5,12 @@ define void @autogen_SD19655(ptr %addr, ptr %addrfloat) {
 ; CHECK-LABEL: autogen_SD19655:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    scvtf.2d v0, v0
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    str d0, [x1]
+; CHECK-NEXT:    mov.d x8, v0[1]
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    scvtf s1, x9
+; CHECK-NEXT:    scvtf s0, x8
+; CHECK-NEXT:    mov.s v1[1], v0[0]
+; CHECK-NEXT:    str d1, [x1]
 ; CHECK-NEXT:    ret
   %T = load <2 x i64>, ptr %addr
   %F = sitofp <2 x i64> %T to <2 x float>
@@ -88,8 +91,12 @@ define <2 x double> @test_unsigned_v2i8_to_v2f64(<2 x i8> %v) nounwind readnone
 define <2 x float> @test_signed_v2i64_to_v2f32(<2 x i64> %v) nounwind readnone {
 ; CHECK-LABEL: test_signed_v2i64_to_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    scvtf.2d v0, v0
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    mov.d x8, v0[1]
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    scvtf s0, x9
+; CHECK-NEXT:    scvtf s1, x8
+; CHECK-NEXT:    mov.s v0[1], v1[0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 
   %conv = sitofp <2 x i64> %v to <2 x float>
@@ -98,8 +105,12 @@ define <2 x float> @test_signed_v2i64_to_v2f32(<2 x i64> %v) nounwind readnone {
 define <2 x float> @test_unsigned_v2i64_to_v2f32(<2 x i64> %v) nounwind readnone {
 ; CHECK-LABEL: test_unsigned_v2i64_to_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ucvtf.2d v0, v0
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    mov.d x8, v0[1]
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    ucvtf s0, x9
+; CHECK-NEXT:    ucvtf s1, x8
+; CHECK-NEXT:    mov.s v0[1], v1[0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 
   %conv = uitofp <2 x i64> %v to <2 x float>
diff --git a/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll
index b40c0656a60e4..b65334e2461fd 100644
--- a/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -262,10 +262,13 @@ define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nou
 ; CHECK-NEON-NEXT:    mov w8, #2 // =0x2
 ; CHECK-NEON-NEXT:    dup v1.2d, x8
 ; CHECK-NEON-NEXT:    ushl v0.2d, v1.2d, v0.2d
-; CHECK-NEON-NEXT:    fmov v1.2s, #15.00000000
-; CHECK-NEON-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEON-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEON-NEXT:    fmul v0.2s, v0.2s, v1.2s
+; CHECK-NEON-NEXT:    mov x8, v0.d[1]
+; CHECK-NEON-NEXT:    fmov x9, d0
+; CHECK-NEON-NEXT:    ucvtf s1, x9
+; CHECK-NEON-NEXT:    ucvtf s0, x8
+; CHECK-NEON-NEXT:    mov v1.s[1], v0.s[0]
+; CHECK-NEON-NEXT:    fmov v0.2s, #15.00000000
+; CHECK-NEON-NEXT:    fmul v0.2s, v1.2s, v0.2s
 ; CHECK-NEON-NEXT:    ret
 ;
 ; CHECK-NO-NEON-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
diff --git a/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll b/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll
index 83e60c1089762..1364c47adff2d 100644
--- a/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll
@@ -193,10 +193,17 @@ define <4 x float> @uitofp_v4f32_v4i32(<4 x i32> %x) #0 {
 define <4 x float> @sitofp_v4f32_v4i64(<4 x i64> %x) #0 {
 ; CHECK-LABEL: sitofp_v4f32_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    scvtf v1.2d, v1.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    scvtf s0, x9
+; CHECK-NEXT:    mov x9, v1.d[1]
+; CHECK-NEXT:    scvtf s2, x8
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    scvtf s1, x8
+; CHECK-NEXT:    mov v0.s[1], v2.s[0]
+; CHECK-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-NEXT:    scvtf s1, x9
+; CHECK-NEXT:    mov v0.s[3], v1.s[0]
 ; CHECK-NEXT:    ret
   %val = call <4 x float> @llvm.experimental.constrained.sitofp.v4f32.v4i64(<4 x i64> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
   ret <4 x float> %val
@@ -205,10 +212,38 @@ define <4 x float> @sitofp_v4f32_v4i64(<4 x i64> %x) #0 {
 define <4 x float> @uitofp_v4f32_v4i64(<4 x i64> %x) #0 {
 ; CHECK-LABEL: uitofp_v4f32_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    ucvtf v1.2d, v1.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-NEXT:    movi v2.2d, #0x000000ffffffff
+; CHECK-NEXT:    ushr v3.2d, v1.2d, #32
+; CHECK-NEXT:    ushr v4.2d, v0.2d, #32
+; CHECK-NEXT:    mov x8, v3.d[1]
+; CHECK-NEXT:    mov x9, v4.d[1]
+; CHECK-NEXT:    fmov x10, d3
+; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    fmov x11, d4
+; CHECK-NEXT:    scvtf s2, x10
+; CHECK-NEXT:    mov x10, v1.d[1]
+; CHECK-NEXT:    scvtf s3, x8
+; CHECK-NEXT:    scvtf s4, x11
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    scvtf s5, x9
+; CHECK-NEXT:    mov w9, #1333788672 // =0x4f800000
+; CHECK-NEXT:    fmov x11, d1
+; CHECK-NEXT:    dup v1.2s, w9
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    scvtf s0, x10
+; CHECK-NEXT:    mov v2.s[1], v3.s[0]
+; CHECK-NEXT:    scvtf s6, x11
+; CHECK-NEXT:    scvtf s3, x8
+; CHECK-NEXT:    mov v4.s[1], v5.s[0]
+; CHECK-NEXT:    scvtf s5, x9
+; CHECK-NEXT:    mov v6.s[1], v0.s[0]
+; CHECK-NEXT:    fmul v0.2s, v2.2s, v1.2s
+; CHECK-NEXT:    fmul v1.2s, v4.2s, v1.2s
+; CHECK-NEXT:    mov v5.s[1], v3.s[0]
+; CHECK-NEXT:    fadd v2.2s, v0.2s, v6.2s
+; CHECK-NEXT:    fadd v0.2s, v1.2s, v5.2s
+; CHECK-NEXT:    mov v0.d[1], v2.d[0]
 ; CHECK-NEXT:    ret
   %val = call <4 x float> @llvm.experimental.constrained.uitofp.v4f32.v4i64(<4 x i64> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
   ret <4 x float> %val
diff --git a/llvm/test/CodeGen/AArch64/fprcvt-cvtf.ll b/llvm/test/CodeGen/AArch64/fprcvt-cvtf.ll
index 0a7319b9ce11e..9da6f583cec01 100644
--- a/llvm/test/CodeGen/AArch64/fprcvt-cvtf.ll
+++ b/llvm/test/CodeGen/AArch64/fprcvt-cvtf.ll
@@ -210,15 +210,20 @@ define <1 x float> @scvtf_f32i64_simple(<1 x i64> %x) {
 ; CHECK-LABEL: scvtf_f32i64_simple:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    movi d1, #0000000000000000
+; CHECK-NEXT:    scvtf s0, d0
+; CHECK-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-NO-FPRCVT-LABEL: scvtf_f32i64_simple:
 ; CHECK-NO-FPRCVT:       // %bb.0:
 ; CHECK-NO-FPRCVT-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NO-FPRCVT-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NO-FPRCVT-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NO-FPRCVT-NEXT:    fmov x8, d0
+; CHECK-NO-FPRCVT-NEXT:    movi d1, #0000000000000000
+; CHECK-NO-FPRCVT-NEXT:    scvtf s0, x8
+; CHECK-NO-FPRCVT-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-NO-FPRCVT-NEXT:    fmov d0, d1
 ; CHECK-NO-FPRCVT-NEXT:    ret
  %conv = sitofp <1 x i64> %x to <1 x float>
  ret <1 x float> %conv
@@ -426,15 +431,20 @@ define <1 x float> @ucvtf_f32i64_simple(<1 x i64> %x) {
 ; CHECK-LABEL: ucvtf_f32i64_simple:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    movi d1, #0000000000000000
+; CHECK-NEXT:    ucvtf s0, d0
+; CHECK-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-NO-FPRCVT-LABEL: ucvtf_f32i64_simple:
 ; CHECK-NO-FPRCVT:       // %bb.0:
 ; CHECK-NO-FPRCVT-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NO-FPRCVT-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NO-FPRCVT-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NO-FPRCVT-NEXT:    fmov x8, d0
+; CHECK-NO-FPRCVT-NEXT:    movi d1, #0000000000000000
+; CHECK-NO-FPRCVT-NEXT:    ucvtf s0, x8
+; CHECK-NO-FPRCVT-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-NO-FPRCVT-NEXT:    fmov d0, d1
 ; CHECK-NO-FPRCVT-NEXT:    ret
  %conv = uitofp <1 x i64> %x to <1 x float>
  ret <1 x float> %conv
diff --git a/llvm/test/CodeGen/AArch64/itofp-bf16.ll b/llvm/test/CodeGen/AArch64/itofp-bf16.ll
index 58591b11c184f..42641693c4081 100644
--- a/llvm/test/CodeGen/AArch64/itofp-bf16.ll
+++ b/llvm/test/CodeGen/AArch64/itofp-bf16.ll
@@ -349,22 +349,27 @@ define <3 x bfloat> @stofp_v3i64_v3bf16(<3 x i64> %a) {
 ; CHECK-LABEL: stofp_v3i64_v3bf16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-NEXT:    scvtf v1.2d, v2.2d
-; CHECK-NEXT:    movi v2.4s, #127, msl #8
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT:    movi v1.4s, #1
-; CHECK-NEXT:    ushr v3.4s, v0.4s, #16
-; CHECK-NEXT:    add v2.4s, v0.4s, v2.4s
-; CHECK-NEXT:    and v1.16b, v3.16b, v1.16b
-; CHECK-NEXT:    fcmeq v3.4s, v0.4s, v0.4s
-; CHECK-NEXT:    orr v0.4s, #64, lsl #16
-; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    bit v0.16b, v1.16b, v3.16b
+; CHECK-NEXT:    scvtf s0, x8
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    scvtf s1, x8
+; CHECK-NEXT:    fmov x8, d2
+; CHECK-NEXT:    mov v3.s[0], v0.s[0]
+; CHECK-NEXT:    scvtf s0, x8
+; CHECK-NEXT:    mov v3.s[1], v1.s[0]
+; CHECK-NEXT:    movi v1.4s, #127, msl #8
+; CHECK-NEXT:    mov v3.s[2], v0.s[0]
+; CHECK-NEXT:    movi v0.4s, #1
+; CHECK-NEXT:    ushr v2.4s, v3.4s, #16
+; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
+; CHECK-NEXT:    and v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    fcmeq v2.4s, v3.4s, v3.4s
+; CHECK-NEXT:    orr v3.4s, #64, lsl #16
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    bif v0.16b, v3.16b, v2.16b
 ; CHECK-NEXT:    shrn v0.4h, v0.4s, #16
 ; CHECK-NEXT:    ret
 entry:
@@ -376,22 +381,27 @@ define <3 x bfloat> @utofp_v3i64_v3bf16(<3 x i64> %a) {
 ; CHECK-LABEL: utofp_v3i64_v3bf16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-NEXT:    ucvtf v1.2d, v2.2d
-; CHECK-NEXT:    movi v2.4s, #127, msl #8
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT:    movi v1.4s, #1
-; CHECK-NEXT:    ushr v3.4s, v0.4s, #16
-; CHECK-NEXT:    add v2.4s, v0.4s, v2.4s
-; CHECK-NEXT:    and v1.16b, v3.16b, v1.16b
-; CHECK-NEXT:    fcmeq v3.4s, v0.4s, v0.4s
-; CHECK-NEXT:    orr v0.4s, #64, lsl #16
-; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    bit v0.16b, v1.16b, v3.16b
+; CHECK-NEXT:    ucvtf s0, x8
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    ucvtf s1, x8
+; CHECK-NEXT:    fmov x8, d2
+; CHECK-NEXT:    mov v3.s[0], v0.s[0]
+; CHECK-NEXT:    ucvtf s0, x8
+; CHECK-NEXT:    mov v3.s[1], v1.s[0]
+; CHECK-NEXT:    movi v1.4s, #127, msl #8
+; CHECK-NEXT:    mov v3.s[2], v0.s[0]
+; CHECK-NEXT:    movi v0.4s, #1
+; CHECK-NEXT:    ushr v2.4s, v3.4s, #16
+; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
+; CHECK-NEXT:    and v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    fcmeq v2.4s, v3.4s, v3.4s
+; CHECK-NEXT:    orr v3.4s, #64, lsl #16
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    bif v0.16b, v3.16b, v2.16b
 ; CHECK-NEXT:    shrn v0.4h, v0.4s, #16
 ; CHECK-NEXT:    ret
 entry:
@@ -402,19 +412,26 @@ entry:
 define <4 x bfloat> @stofp_v4i64_v4bf16(<4 x i64> %a) {
 ; CHECK-LABEL: stofp_v4i64_v4bf16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    scvtf v1.2d, v1.2d
-; CHECK-NEXT:    movi v2.4s, #127, msl #8
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT:    movi v1.4s, #1
-; CHECK-NEXT:    ushr v3.4s, v0.4s, #16
-; CHECK-NEXT:    add v2.4s, v0.4s, v2.4s
-; CHECK-NEXT:    and v1.16b, v3.16b, v1.16b
-; CHECK-NEXT:    fcmeq v3.4s, v0.4s, v0.4s
-; CHECK-NEXT:    orr v0.4s, #64, lsl #16
-; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    bit v0.16b, v1.16b, v3.16b
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    scvtf s2, x9
+; CHECK-NEXT:    mov x9, v1.d[1]
+; CHECK-NEXT:    scvtf s0, x8
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    scvtf s1, x8
+; CHECK-NEXT:    mov v2.s[1], v0.s[0]
+; CHECK-NEXT:    scvtf s0, x9
+; CHECK-NEXT:    mov v2.s[2], v1.s[0]
+; CHECK-NEXT:    movi v1.4s, #127, msl #8
+; CHECK-NEXT:    mov v2.s[3], v0.s[0]
+; CHECK-NEXT:    movi v0.4s, #1
+; CHECK-NEXT:    ushr v3.4s, v2.4s, #16
+; CHECK-NEXT:    add v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    and v0.16b, v3.16b, v0.16b
+; CHECK-NEXT:    fcmeq v3.4s, v2.4s, v2.4s
+; CHECK-NEXT:    orr v2.4s, #64, lsl #16
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    bif v0.16b, v2.16b, v3.16b
 ; CHECK-NEXT:    shrn v0.4h, v0.4s, #16
 ; CHECK-NEXT:    ret
 entry:
@@ -425,19 +442,26 @@ entry:
 define <4 x bfloat> @utofp_v4i64_v4bf16(<4 x i64> %a) {
 ; CHECK-LABEL: utofp_v4i64_v4bf16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    ucvtf v1.2d, v1.2d
-; CHECK-NEXT:    movi v2.4s, #127, msl #8
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT:    movi v1.4s, #1
-; CHECK-NEXT:    ushr v3.4s, v0.4s, #16
-; CHECK-NEXT:    add v2.4s, v0.4s, v2.4s
-; CHECK-NEXT:    and v1.16b, v3.16b, v1.16b
-; CHECK-NEXT:    fcmeq v3.4s, v0.4s, v0.4s
-; CHECK-NEXT:    orr v0.4s, #64, lsl #16
-; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    bit v0.16b, v1.16b, v3.16b
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    ucvtf s2, x9
+; CHECK-NEXT:    mov x9, v1.d[1]
+; CHECK-NEXT:    ucvtf s0, x8
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    ucvtf s1, x8
+; CHECK-NEXT:    mov v2.s[1], v0.s[0]
+; CHECK-NEXT:    ucvtf s0, x9
+; CHECK-NEXT:    mov v2.s[2], v1.s[0]
+; CHECK-NEXT:    movi v1.4s, #127, msl #8
+; CHECK-NEXT:    mov v2.s[3], v0.s[0]
+; CHECK-NEXT:    movi v0.4s, #1
+; CHECK-NEXT:    ushr v3.4s, v2.4s, #16
+; CHECK-NEXT:    add v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    and v0.16b, v3.16b, v0.16b
+; CHECK-NEXT:    fcmeq v3.4s, v2.4s, v2.4s
+; CHECK-NEXT:    orr v2.4s, #64, lsl #16
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    bif v0.16b, v2.16b, v3.16b
 ; CHECK-NEXT:    shrn v0.4h, v0.4s, #16
 ; CHECK-NEXT:    ret
 entry:
@@ -448,31 +472,46 @@ entry:
 define <8 x bfloat> @stofp_v8i64_v8bf16(<8 x i64> %a) {
 ; CHECK-LABEL: stofp_v8i64_v8bf16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    scvtf v2.2d, v2.2d
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    scvtf v3.2d, v3.2d
-; CHECK-NEXT:    scvtf v1.2d, v1.2d
-; CHECK-NEXT:    fcvtn v2.2s, v2.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn2 v2.4s, v3.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT:    movi v1.4s, #1
-; CHECK-NEXT:    movi v3.4s, #127, msl #8
-; CHECK-NEXT:    ushr v4.4s, v2.4s, #16
-; CHECK-NEXT:    ushr v5.4s, v0.4s, #16
-; CHECK-NEXT:    add v6.4s, v2.4s, v3.4s
-; CHECK-NEXT:    add v3.4s, v0.4s, v3.4s
-; CHECK-NEXT:    and v4.16b, v4.16b, v1.16b
-; CHECK-NEXT:    and v1.16b, v5.16b, v1.16b
+; CHECK-NEXT:    fmov x10, d2
+; CHECK-NEXT:    mov x8, v2.d[1]
+; CHECK-NEXT:    mov x9, v0.d[1]
+; CHECK-NEXT:    scvtf s2, x10
+; CHECK-NEXT:    fmov x10, d0
+; CHECK-NEXT:    scvtf s0, x8
+; CHECK-NEXT:    scvtf s5, x9
+; CHECK-NEXT:    fmov x9, d3
+; CHECK-NEXT:    mov x8, v3.d[1]
+; CHECK-NEXT:    scvtf s4, x10
+; CHECK-NEXT:    fmov x10, d1
+; CHECK-NEXT:    scvtf s3, x9
+; CHECK-NEXT:    mov x9, v1.d[1]
+; CHECK-NEXT:    mov v2.s[1], v0.s[0]
+; CHECK-NEXT:    scvtf s0, x10
+; CHECK-NEXT:    scvtf s1, x8
+; CHECK-NEXT:    mov v4.s[1], v5.s[0]
+; CHECK-NEXT:    mov v2.s[2], v3.s[0]
+; CHECK-NEXT:    scvtf s3, x9
+; CHECK-NEXT:    mov v4.s[2], v0.s[0]
+; CHECK-NEXT:    movi v0.4s, #1
+; CHECK-NEXT:    mov v2.s[3], v1.s[0]
+; CHECK-NEXT:    movi v1.4s, #127, msl #8
+; CHECK-NEXT:    mov v4.s[3], v3.s[0]
+; CHECK-NEXT:    ushr v3.4s, v2.4s, #16
+; CHECK-NEXT:    add v6.4s, v2.4s, v1.4s
+; CHECK-NEXT:    ushr v5.4s, v4.4s, #16
+; CHECK-NEXT:    add v1.4s, v4.4s, v1.4s
+; CHECK-NEXT:    and v3.16b, v3.16b, v0.16b
+; CHECK-NEXT:    and v0.16b, v5.16b, v0.16b
 ; CHECK-NEXT:    fcmeq v5.4s, v2.4s, v2.4s
 ; CHECK-NEXT:    orr v2.4s, #64, lsl #16
-; CHECK-NEXT:    add v4.4s, v4.4s, v6.4s
-; CHECK-NEXT:    fcmeq v6.4s, v0.4s, v0.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    orr v0.4s, #64, lsl #16
-; CHECK-NEXT:    bit v2.16b, v4.16b, v5.16b
-; CHECK-NEXT:    bit v0.16b, v1.16b, v6.16b
-; CHECK-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    add v3.4s, v3.4s, v6.4s
+; CHECK-NEXT:    fcmeq v6.4s, v4.4s, v4.4s
+; CHECK-NEXT:    orr v4.4s, #64, lsl #16
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mov v1.16b, v5.16b
+; CHECK-NEXT:    bif v0.16b, v4.16b, v6.16b
+; CHECK-NEXT:    bsl v1.16b, v3.16b, v2.16b
+; CHECK-NEXT:    uzp2 v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
 entry:
   %c = sitofp <8 x i64> %a to <8 x bfloat>
@@ -482,31 +521,46 @@ entry:
 define <8 x bfloat> @utofp_v8i64_v8bf16(<8 x i64> %a) {
 ; CHECK-LABEL: utofp_v8i64_v8bf16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ucvtf v2.2d, v2.2d
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    ucvtf v3.2d, v3.2d
-; CHECK-NEXT:    ucvtf v1.2d, v1.2d
-; CHECK-NEXT:    fcvtn v2.2s, v2.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn2 v2.4s, v3.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT:    movi v1.4s, #1
-; CHECK-NEXT:    movi v3.4s, #127, msl #8
-; CHECK-NEXT:    ushr v4.4s, v2.4s, #16
-; CHECK-NEXT:    ushr v5.4s, v0.4s, #16
-; CHECK-NEXT:    add v6.4s, v2.4s, v3.4s
-; CHECK-NEXT:    add v3.4s, v0.4s, v3.4s
-; CHECK-NEXT:    and v4.16b, v4.16b, v1.16b
-; CHECK-NEXT:    and v1.16b, v5.16b, v1.16b
+; CHECK-NEXT:    fmov x10, d2
+; CHECK-NEXT:    mov x8, v2.d[1]
+; CHECK-NEXT:    mov x9, v0.d[1]
+; CHECK-NEXT:    ucvtf s2, x10
+; CHECK-NEXT:    fmov x10, d0
+; CHECK-NEXT:    ucvtf s0, x8
+; CHECK-NEXT:    ucvtf s5, x9
+; CHECK-NEXT:    fmov x9, d3
+; CHECK-NEXT:    mov x8, v3.d[1]
+; CHECK-NEXT:    ucvtf s4, x10
+; CHECK-NEXT:    fmov x10, d1
+; CHECK-NEXT:    ucvtf s3, x9
+; CHECK-NEXT:    mov x9, v1.d[1]
+; CHECK-NEXT:    mov v2.s[1], v0.s[0]
+; CHECK-NEXT:    ucvtf s0, x10
+; CHECK-NEXT:    ucvtf s1, x8
+; CHECK-NEXT:    mov v4.s[1], v5.s[0]
+; CHECK-NEXT:    mov v2.s[2], v3.s[0]
+; CHECK-NEXT:    ucvtf s3, x9
+; CHECK-NEXT:    mov v4.s[2], v0.s[0]
+; CHECK-NEXT:    movi v0.4s, #1
+; CHECK-NEXT:    mov v2.s[3], v1.s[0]
+; CHECK-NEXT:    movi v1.4s, #127, msl #8
+; CHECK-NEXT:    mov v4.s[3], v3.s[0]
+; CHECK-NEXT:    ushr v3.4s, v2.4s, #16
+; CHECK-NEXT:    add v6.4s, v2.4s, v1.4s
+; CHECK-NEXT:    ushr v5.4s, v4.4s, #16
+; CHECK-NEXT:    add v1.4s, v4.4s, v1.4s
+; CHECK-NEXT:    and v3.16b, v3.16b, v0.16b
+; CHECK-NEXT:    and v0.16b, v5.16b, v0.16b
 ; CHECK-NEXT:    fcmeq v5.4s, v2.4s, v2.4s
 ; CHECK-NEXT:    orr v2.4s, #64, lsl #16
-; CHECK-NEXT:    add v4.4s, v4.4s, v6.4s
-; CHECK-NEXT:    fcmeq v6.4s, v0.4s, v0.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    orr v0.4s, #64, lsl #16
-; CHECK-NEXT:    bit v2.16b, v4.16b, v5.16b
-; CHECK-NEXT:    bit v0.16b, v1.16b, v6.16b
-; CHECK-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    add v3.4s, v3.4s, v6.4s
+; CHECK-NEXT:    fcmeq v6.4s, v4.4s, v4.4s
+; CHECK-NEXT:    orr v4.4s, #64, lsl #16
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mov v1.16b, v5.16b
+; CHECK-NEXT:    bif v0.16b, v4.16b, v6.16b
+; CHECK-NEXT:    bsl v1.16b, v3.16b, v2.16b
+; CHECK-NEXT:    uzp2 v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
 entry:
   %c = uitofp <8 x i64> %a to <8 x bfloat>
@@ -516,55 +570,82 @@ entry:
 define <16 x bfloat> @stofp_v16i64_v16bf16(<16 x i64> %a) {
 ; CHECK-LABEL: stofp_v16i64_v16bf16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    scvtf v2.2d, v2.2d
-; CHECK-NEXT:    scvtf v6.2d, v6.2d
-; CHECK-NEXT:    scvtf v4.2d, v4.2d
-; CHECK-NEXT:    scvtf v1.2d, v1.2d
-; CHECK-NEXT:    scvtf v3.2d, v3.2d
-; CHECK-NEXT:    scvtf v7.2d, v7.2d
-; CHECK-NEXT:    scvtf v5.2d, v5.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn v2.2s, v2.2d
-; CHECK-NEXT:    fcvtn v6.2s, v6.2d
-; CHECK-NEXT:    fcvtn v4.2s, v4.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT:    fcvtn2 v2.4s, v3.2d
-; CHECK-NEXT:    fcvtn2 v6.4s, v7.2d
-; CHECK-NEXT:    fcvtn2 v4.4s, v5.2d
+; CHECK-NEXT:    mov x8, v2.d[1]
+; CHECK-NEXT:    fmov x11, d0
+; CHECK-NEXT:    mov x9, v0.d[1]
+; CHECK-NEXT:    fmov x10, d2
+; CHECK-NEXT:    mov x12, v6.d[1]
+; CHECK-NEXT:    scvtf s2, x11
+; CHECK-NEXT:    fmov x11, d3
+; CHECK-NEXT:    scvtf s16, x8
+; CHECK-NEXT:    fmov x8, d6
+; CHECK-NEXT:    scvtf s0, x10
+; CHECK-NEXT:    mov x10, v4.d[1]
+; CHECK-NEXT:    scvtf s17, x9
+; CHECK-NEXT:    mov x9, v3.d[1]
+; CHECK-NEXT:    scvtf s6, x12
+; CHECK-NEXT:    fmov x12, d4
+; CHECK-NEXT:    scvtf s4, x11
+; CHECK-NEXT:    scvtf s3, x8
+; CHECK-NEXT:    fmov x11, d7
+; CHECK-NEXT:    mov x8, v1.d[1]
+; CHECK-NEXT:    mov v0.s[1], v16.s[0]
+; CHECK-NEXT:    scvtf s18, x10
+; CHECK-NEXT:    scvtf s19, x12
+; CHECK-NEXT:    fmov x10, d1
+; CHECK-NEXT:    mov v2.s[1], v17.s[0]
+; CHECK-NEXT:    mov x12, v5.d[1]
+; CHECK-NEXT:    mov v3.s[1], v6.s[0]
+; CHECK-NEXT:    scvtf s6, x11
+; CHECK-NEXT:    fmov x11, d5
+; CHECK-NEXT:    scvtf s1, x10
+; CHECK-NEXT:    mov x10, v7.d[1]
+; CHECK-NEXT:    scvtf s7, x9
+; CHECK-NEXT:    mov v19.s[1], v18.s[0]
+; CHECK-NEXT:    scvtf s16, x8
+; CHECK-NEXT:    mov v0.s[2], v4.s[0]
+; CHECK-NEXT:    scvtf s5, x11
+; CHECK-NEXT:    mov v3.s[2], v6.s[0]
+; CHECK-NEXT:    scvtf s4, x10
+; CHECK-NEXT:    mov v2.s[2], v1.s[0]
+; CHECK-NEXT:    scvtf s1, x12
+; CHECK-NEXT:    mov v0.s[3], v7.s[0]
+; CHECK-NEXT:    mov v19.s[2], v5.s[0]
+; CHECK-NEXT:    mov v2.s[3], v16.s[0]
+; CHECK-NEXT:    mov v3.s[3], v4.s[0]
+; CHECK-NEXT:    movi v4.4s, #127, msl #8
+; CHECK-NEXT:    ushr v5.4s, v0.4s, #16
+; CHECK-NEXT:    mov v19.s[3], v1.s[0]
 ; CHECK-NEXT:    movi v1.4s, #1
-; CHECK-NEXT:    movi v3.4s, #127, msl #8
-; CHECK-NEXT:    ushr v7.4s, v0.4s, #16
-; CHECK-NEXT:    ushr v5.4s, v2.4s, #16
-; CHECK-NEXT:    ushr v16.4s, v6.4s, #16
-; CHECK-NEXT:    ushr v17.4s, v4.4s, #16
-; CHECK-NEXT:    add v19.4s, v0.4s, v3.4s
-; CHECK-NEXT:    add v18.4s, v2.4s, v3.4s
-; CHECK-NEXT:    add v20.4s, v6.4s, v3.4s
-; CHECK-NEXT:    add v3.4s, v4.4s, v3.4s
-; CHECK-NEXT:    and v7.16b, v7.16b, v1.16b
+; CHECK-NEXT:    ushr v6.4s, v2.4s, #16
+; CHECK-NEXT:    ushr v7.4s, v3.4s, #16
+; CHECK-NEXT:    add v17.4s, v0.4s, v4.4s
+; CHECK-NEXT:    add v18.4s, v2.4s, v4.4s
+; CHECK-NEXT:    add v20.4s, v3.4s, v4.4s
+; CHECK-NEXT:    ushr v16.4s, v19.4s, #16
 ; CHECK-NEXT:    and v5.16b, v5.16b, v1.16b
-; CHECK-NEXT:    and v16.16b, v16.16b, v1.16b
-; CHECK-NEXT:    and v1.16b, v17.16b, v1.16b
+; CHECK-NEXT:    add v4.4s, v19.4s, v4.4s
+; CHECK-NEXT:    and v6.16b, v6.16b, v1.16b
+; CHECK-NEXT:    and v7.16b, v7.16b, v1.16b
+; CHECK-NEXT:    and v1.16b, v16.16b, v1.16b
+; CHECK-NEXT:    add v5.4s, v5.4s, v17.4s
+; CHECK-NEXT:    fcmeq v16.4s, v0.4s, v0.4s
+; CHECK-NEXT:    add v6.4s, v6.4s, v18.4s
 ; CHECK-NEXT:    fcmeq v17.4s, v2.4s, v2.4s
-; CHECK-NEXT:    orr v2.4s, #64, lsl #16
-; CHECK-NEXT:    add v7.4s, v7.4s, v19.4s
-; CHECK-NEXT:    fcmeq v19.4s, v6.4s, v6.4s
-; CHECK-NEXT:    add v5.4s, v5.4s, v18.4s
-; CHECK-NEXT:    fcmeq v18.4s, v0.4s, v0.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    fcmeq v3.4s, v4.4s, v4.4s
-; CHECK-NEXT:    add v16.4s, v16.4s, v20.4s
+; CHECK-NEXT:    fcmeq v18.4s, v3.4s, v3.4s
 ; CHECK-NEXT:    orr v0.4s, #64, lsl #16
-; CHECK-NEXT:    orr v6.4s, #64, lsl #16
-; CHECK-NEXT:    orr v4.4s, #64, lsl #16
-; CHECK-NEXT:    bit v2.16b, v5.16b, v17.16b
-; CHECK-NEXT:    mov v5.16b, v19.16b
-; CHECK-NEXT:    bit v0.16b, v7.16b, v18.16b
-; CHECK-NEXT:    bif v1.16b, v4.16b, v3.16b
-; CHECK-NEXT:    bsl v5.16b, v16.16b, v6.16b
-; CHECK-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
-; CHECK-NEXT:    uzp2 v1.8h, v1.8h, v5.8h
+; CHECK-NEXT:    add v7.4s, v7.4s, v20.4s
+; CHECK-NEXT:    orr v2.4s, #64, lsl #16
+; CHECK-NEXT:    add v1.4s, v1.4s, v4.4s
+; CHECK-NEXT:    fcmeq v4.4s, v19.4s, v19.4s
+; CHECK-NEXT:    orr v3.4s, #64, lsl #16
+; CHECK-NEXT:    orr v19.4s, #64, lsl #16
+; CHECK-NEXT:    bit v0.16b, v5.16b, v16.16b
+; CHECK-NEXT:    bit v2.16b, v6.16b, v17.16b
+; CHECK-NEXT:    bit v3.16b, v7.16b, v18.16b
+; CHECK-NEXT:    bif v1.16b, v19.16b, v4.16b
+; CHECK-NEXT:    uzp2 v0.8h, v2.8h, v0.8h
+; CHECK-NEXT:    uzp2 v1.8h, v1.8h, v3.8h
 ; CHECK-NEXT:    ret
 entry:
   %c = sitofp <16 x i64> %a to <16 x bfloat>
@@ -574,55 +655,82 @@ entry:
 define <16 x bfloat> @utofp_v16i64_v16bf16(<16 x i64> %a) {
 ; CHECK-LABEL: utofp_v16i64_v16bf16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    ucvtf v2.2d, v2.2d
-; CHECK-NEXT:    ucvtf v6.2d, v6.2d
-; CHECK-NEXT:    ucvtf v4.2d, v4.2d
-; CHECK-NEXT:    ucvtf v1.2d, v1.2d
-; CHECK-NEXT:    ucvtf v3.2d, v3.2d
-; CHECK-NEXT:    ucvtf v7.2d, v7.2d
-; CHECK-NEXT:    ucvtf v5.2d, v5.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn v2.2s, v2.2d
-; CHECK-NEXT:    fcvtn v6.2s, v6.2d
-; CHECK-NEXT:    fcvtn v4.2s, v4.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT:    fcvtn2 v2.4s, v3.2d
-; CHECK-NEXT:    fcvtn2 v6.4s, v7.2d
-; CHECK-NEXT:    fcvtn2 v4.4s, v5.2d
+; CHECK-NEXT:    mov x8, v2.d[1]
+; CHECK-NEXT:    fmov x11, d0
+; CHECK-NEXT:    mov x9, v0.d[1]
+; CHECK-NEXT:    fmov x10, d2
+; CHECK-NEXT:    mov x12, v6.d[1]
+; CHECK-NEXT:    ucvtf s2, x11
+; CHECK-NEXT:    fmov x11, d3
+; CHECK-NEXT:    ucvtf s16, x8
+; CHECK-NEXT:    fmov x8, d6
+; CHECK-NEXT:    ucvtf s0, x10
+; CHECK-NEXT:    mov x10, v4.d[1]
+; CHECK-NEXT:    ucvtf s17, x9
+; CHECK-NEXT:    mov x9, v3.d[1]
+; CHECK-NEXT:    ucvtf s6, x12
+; CHECK-NEXT:    fmov x12, d4
+; CHECK-NEXT:    ucvtf s4, x11
+; CHECK-NEXT:    ucvtf s3, x8
+; CHECK-NEXT:    fmov x11, d7
+; CHECK-NEXT:    mov x8, v1.d[1]
+; CHECK-NEXT:    mov v0.s[1], v16.s[0]
+; CHECK-NEXT:    ucvtf s18, x10
+; CHECK-NEXT:    ucvtf s19, x12
+; CHECK-NEXT:    fmov x10, d1
+; CHECK-NEXT:    mov v2.s[1], v17.s[0]
+; CHECK-NEXT:    mov x12, v5.d[1]
+; CHECK-NEXT:    mov v3.s[1], v6.s[0]
+; CHECK-NEXT:    ucvtf s6, x11
+; CHECK-NEXT:    fmov x11, d5
+; CHECK-NEXT:    ucvtf s1, x10
+; CHECK-NEXT:    mov x10, v7.d[1]
+; CHECK-NEXT:    ucvtf s7, x9
+; CHECK-NEXT:    mov v19.s[1], v18.s[0]
+; CHECK-NEXT:    ucvtf s16, x8
+; CHECK-NEXT:    mov v0.s[2], v4.s[0]
+; CHECK-NEXT:    ucvtf s5, x11
+; CHECK-NEXT:    mov v3.s[2], v6.s[0]
+; CHECK-NEXT:    ucvtf s4, x10
+; CHECK-NEXT:    mov v2.s[2], v1.s[0]
+; CHECK-NEXT:    ucvtf s1, x12
+; CHECK-NEXT:    mov v0.s[3], v7.s[0]
+; CHECK-NEXT:    mov v19.s[2], v5.s[0]
+; CHECK-NEXT:    mov v2.s[3], v16.s[0]
+; CHECK-NEXT:    mov v3.s[3], v4.s[0]
+; CHECK-NEXT:    movi v4.4s, #127, msl #8
+; CHECK-NEXT:    ushr v5.4s, v0.4s, #16
+; CHECK-NEXT:    mov v19.s[3], v1.s[0]
 ; CHECK-NEXT:    movi v1.4s, #1
-; CHECK-NEXT:    movi v3.4s, #127, msl #8
-; CHECK-NEXT:    ushr v7.4s, v0.4s, #16
-; CHECK-NEXT:    ushr v5.4s, v2.4s, #16
-; CHECK-NEXT:    ushr v16.4s, v6.4s, #16
-; CHECK-NEXT:    ushr v17.4s, v4.4s, #16
-; CHECK-NEXT:    add v19.4s, v0.4s, v3.4s
-; CHECK-NEXT:    add v18.4s, v2.4s, v3.4s
-; CHECK-NEXT:    add v20.4s, v6.4s, v3.4s
-; CHECK-NEXT:    add v3.4s, v4.4s, v3.4s
-; CHECK-NEXT:    and v7.16b, v7.16b, v1.16b
+; CHECK-NEXT:    ushr v6.4s, v2.4s, #16
+; CHECK-NEXT:    ushr v7.4s, v3.4s, #16
+; CHECK-NEXT:    add v17.4s, v0.4s, v4.4s
+; CHECK-NEXT:    add v18.4s, v2.4s, v4.4s
+; CHECK-NEXT:    add v20.4s, v3.4s, v4.4s
+; CHECK-NEXT:    ushr v16.4s, v19.4s, #16
 ; CHECK-NEXT:    and v5.16b, v5.16b, v1.16b
-; CHECK-NEXT:    and v16.16b, v16.16b, v1.16b
-; CHECK-NEXT:    and v1.16b, v17.16b, v1.16b
+; CHECK-NEXT:    add v4.4s, v19.4s, v4.4s
+; CHECK-NEXT:    and v6.16b, v6.16b, v1.16b
+; CHECK-NEXT:    and v7.16b, v7.16b, v1.16b
+; CHECK-NEXT:    and v1.16b, v16.16b, v1.16b
+; CHECK-NEXT:    add v5.4s, v5.4s, v17.4s
+; CHECK-NEXT:    fcmeq v16.4s, v0.4s, v0.4s
+; CHECK-NEXT:    add v6.4s, v6.4s, v18.4s
 ; CHECK-NEXT:    fcmeq v17.4s, v2.4s, v2.4s
-; CHECK-NEXT:    orr v2.4s, #64, lsl #16
-; CHECK-NEXT:    add v7.4s, v7.4s, v19.4s
-; CHECK-NEXT:    fcmeq v19.4s, v6.4s, v6.4s
-; CHECK-NEXT:    add v5.4s, v5.4s, v18.4s
-; CHECK-NEXT:    fcmeq v18.4s, v0.4s, v0.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    fcmeq v3.4s, v4.4s, v4.4s
-; CHECK-NEXT:    add v16.4s, v16.4s, v20.4s
+; CHECK-NEXT:    fcmeq v18.4s, v3.4s, v3.4s
 ; CHECK-NEXT:    orr v0.4s, #64, lsl #16
-; CHECK-NEXT:    orr v6.4s, #64, lsl #16
-; CHECK-NEXT:    orr v4.4s, #64, lsl #16
-; CHECK-NEXT:    bit v2.16b, v5.16b, v17.16b
-; CHECK-NEXT:    mov v5.16b, v19.16b
-; CHECK-NEXT:    bit v0.16b, v7.16b, v18.16b
-; CHECK-NEXT:    bif v1.16b, v4.16b, v3.16b
-; CHECK-NEXT:    bsl v5.16b, v16.16b, v6.16b
-; CHECK-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
-; CHECK-NEXT:    uzp2 v1.8h, v1.8h, v5.8h
+; CHECK-NEXT:    add v7.4s, v7.4s, v20.4s
+; CHECK-NEXT:    orr v2.4s, #64, lsl #16
+; CHECK-NEXT:    add v1.4s, v1.4s, v4.4s
+; CHECK-NEXT:    fcmeq v4.4s, v19.4s, v19.4s
+; CHECK-NEXT:    orr v3.4s, #64, lsl #16
+; CHECK-NEXT:    orr v19.4s, #64, lsl #16
+; CHECK-NEXT:    bit v0.16b, v5.16b, v16.16b
+; CHECK-NEXT:    bit v2.16b, v6.16b, v17.16b
+; CHECK-NEXT:    bit v3.16b, v7.16b, v18.16b
+; CHECK-NEXT:    bif v1.16b, v19.16b, v4.16b
+; CHECK-NEXT:    uzp2 v0.8h, v2.8h, v0.8h
+; CHECK-NEXT:    uzp2 v1.8h, v1.8h, v3.8h
 ; CHECK-NEXT:    ret
 entry:
   %c = uitofp <16 x i64> %a to <16 x bfloat>
@@ -632,107 +740,162 @@ entry:
 define <32 x bfloat> @stofp_v32i64_v32bf16(<32 x i64> %a) {
 ; CHECK-LABEL: stofp_v32i64_v32bf16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    scvtf v17.2d, v2.2d
-; CHECK-NEXT:    scvtf v18.2d, v0.2d
-; CHECK-NEXT:    scvtf v19.2d, v3.2d
-; CHECK-NEXT:    scvtf v3.2d, v6.2d
-; CHECK-NEXT:    ldp q21, q20, [sp, #32]
-; CHECK-NEXT:    scvtf v4.2d, v4.2d
-; CHECK-NEXT:    scvtf v6.2d, v7.2d
-; CHECK-NEXT:    scvtf v5.2d, v5.2d
-; CHECK-NEXT:    ldp q24, q23, [sp, #64]
-; CHECK-NEXT:    movi v16.4s, #1
-; CHECK-NEXT:    fcvtn v0.2s, v17.2d
-; CHECK-NEXT:    scvtf v17.2d, v1.2d
-; CHECK-NEXT:    fcvtn v1.2s, v18.2d
-; CHECK-NEXT:    fcvtn v3.2s, v3.2d
-; CHECK-NEXT:    ldp q18, q7, [sp]
-; CHECK-NEXT:    scvtf v21.2d, v21.2d
-; CHECK-NEXT:    fcvtn v4.2s, v4.2d
-; CHECK-NEXT:    movi v2.4s, #127, msl #8
-; CHECK-NEXT:    scvtf v20.2d, v20.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v19.2d
-; CHECK-NEXT:    ldp q22, q19, [sp, #96]
-; CHECK-NEXT:    fcvtn2 v1.4s, v17.2d
-; CHECK-NEXT:    fcvtn2 v3.4s, v6.2d
-; CHECK-NEXT:    scvtf v18.2d, v18.2d
-; CHECK-NEXT:    scvtf v17.2d, v24.2d
-; CHECK-NEXT:    fcvtn v6.2s, v21.2d
-; CHECK-NEXT:    fcvtn2 v4.4s, v5.2d
-; CHECK-NEXT:    scvtf v22.2d, v22.2d
-; CHECK-NEXT:    scvtf v21.2d, v23.2d
-; CHECK-NEXT:    scvtf v7.2d, v7.2d
-; CHECK-NEXT:    ushr v24.4s, v0.4s, #16
-; CHECK-NEXT:    add v5.4s, v0.4s, v2.4s
-; CHECK-NEXT:    scvtf v19.2d, v19.2d
-; CHECK-NEXT:    ushr v23.4s, v1.4s, #16
-; CHECK-NEXT:    ushr v25.4s, v3.4s, #16
-; CHECK-NEXT:    fcvtn v18.2s, v18.2d
-; CHECK-NEXT:    fcvtn2 v6.4s, v20.2d
-; CHECK-NEXT:    add v26.4s, v1.4s, v2.4s
-; CHECK-NEXT:    fcvtn v17.2s, v17.2d
-; CHECK-NEXT:    and v24.16b, v24.16b, v16.16b
-; CHECK-NEXT:    fcvtn v22.2s, v22.2d
-; CHECK-NEXT:    fcmeq v20.4s, v0.4s, v0.4s
-; CHECK-NEXT:    and v23.16b, v23.16b, v16.16b
-; CHECK-NEXT:    orr v0.4s, #64, lsl #16
-; CHECK-NEXT:    fcmeq v27.4s, v3.4s, v3.4s
-; CHECK-NEXT:    fcvtn2 v18.4s, v7.2d
-; CHECK-NEXT:    add v7.4s, v3.4s, v2.4s
-; CHECK-NEXT:    orr v3.4s, #64, lsl #16
-; CHECK-NEXT:    add v5.4s, v24.4s, v5.4s
-; CHECK-NEXT:    and v24.16b, v25.16b, v16.16b
-; CHECK-NEXT:    ushr v25.4s, v4.4s, #16
-; CHECK-NEXT:    fcvtn2 v22.4s, v19.2d
-; CHECK-NEXT:    add v19.4s, v23.4s, v26.4s
-; CHECK-NEXT:    ushr v26.4s, v6.4s, #16
-; CHECK-NEXT:    fcvtn2 v17.4s, v21.2d
-; CHECK-NEXT:    fcmeq v21.4s, v1.4s, v1.4s
+; CHECK-NEXT:    fmov x10, d2
+; CHECK-NEXT:    mov x9, v3.d[1]
+; CHECK-NEXT:    mov x8, v2.d[1]
+; CHECK-NEXT:    fmov x11, d3
+; CHECK-NEXT:    fmov x12, d0
+; CHECK-NEXT:    movi v3.4s, #1
+; CHECK-NEXT:    scvtf s2, x10
+; CHECK-NEXT:    mov x10, v0.d[1]
+; CHECK-NEXT:    scvtf s19, x9
+; CHECK-NEXT:    fmov x9, d1
+; CHECK-NEXT:    scvtf s16, x11
+; CHECK-NEXT:    mov x11, v6.d[1]
+; CHECK-NEXT:    scvtf s0, x12
+; CHECK-NEXT:    scvtf s18, x8
+; CHECK-NEXT:    mov x8, v1.d[1]
+; CHECK-NEXT:    scvtf s20, x10
+; CHECK-NEXT:    scvtf s17, x9
+; CHECK-NEXT:    mov x9, v7.d[1]
+; CHECK-NEXT:    mov x10, v4.d[1]
+; CHECK-NEXT:    scvtf s21, x11
+; CHECK-NEXT:    fmov x11, d6
+; CHECK-NEXT:    mov v2.s[1], v18.s[0]
+; CHECK-NEXT:    scvtf s25, x8
+; CHECK-NEXT:    movi v6.4s, #127, msl #8
+; CHECK-NEXT:    mov v0.s[1], v20.s[0]
+; CHECK-NEXT:    ldp q24, q20, [sp, #32]
+; CHECK-NEXT:    scvtf s22, x9
+; CHECK-NEXT:    fmov x9, d4
+; CHECK-NEXT:    scvtf s1, x11
+; CHECK-NEXT:    scvtf s26, x10
+; CHECK-NEXT:    fmov x11, d7
+; CHECK-NEXT:    mov v2.s[2], v16.s[0]
+; CHECK-NEXT:    ldp q18, q16, [sp]
+; CHECK-NEXT:    mov x8, v24.d[1]
+; CHECK-NEXT:    scvtf s4, x9
+; CHECK-NEXT:    fmov x9, d5
+; CHECK-NEXT:    mov v0.s[2], v17.s[0]
+; CHECK-NEXT:    mov v1.s[1], v21.s[0]
+; CHECK-NEXT:    scvtf s23, x11
+; CHECK-NEXT:    mov x11, v5.d[1]
+; CHECK-NEXT:    mov v2.s[3], v19.s[0]
+; CHECK-NEXT:    scvtf s21, x8
+; CHECK-NEXT:    mov x8, v20.d[1]
+; CHECK-NEXT:    scvtf s17, x9
+; CHECK-NEXT:    fmov x9, d24
+; CHECK-NEXT:    mov v4.s[1], v26.s[0]
+; CHECK-NEXT:    mov v0.s[3], v25.s[0]
+; CHECK-NEXT:    ldp q26, q24, [sp, #96]
+; CHECK-NEXT:    mov v1.s[2], v23.s[0]
+; CHECK-NEXT:    ldp q25, q23, [sp, #64]
+; CHECK-NEXT:    scvtf s7, x11
+; CHECK-NEXT:    scvtf s27, x8
+; CHECK-NEXT:    fmov x8, d18
+; CHECK-NEXT:    scvtf s5, x9
+; CHECK-NEXT:    mov x10, v26.d[1]
+; CHECK-NEXT:    mov x9, v18.d[1]
+; CHECK-NEXT:    fmov x11, d20
+; CHECK-NEXT:    mov v4.s[2], v17.s[0]
+; CHECK-NEXT:    mov v1.s[3], v22.s[0]
+; CHECK-NEXT:    ushr v19.4s, v2.4s, #16
+; CHECK-NEXT:    scvtf s17, x8
+; CHECK-NEXT:    fmov x8, d26
+; CHECK-NEXT:    add v26.4s, v2.4s, v6.4s
+; CHECK-NEXT:    scvtf s22, x11
+; CHECK-NEXT:    mov x11, v25.d[1]
+; CHECK-NEXT:    mov v5.s[1], v21.s[0]
+; CHECK-NEXT:    scvtf s28, x10
+; CHECK-NEXT:    fmov x10, d16
+; CHECK-NEXT:    scvtf s21, x9
+; CHECK-NEXT:    fmov x9, d25
+; CHECK-NEXT:    scvtf s18, x8
+; CHECK-NEXT:    mov x8, v16.d[1]
+; CHECK-NEXT:    mov v4.s[3], v7.s[0]
+; CHECK-NEXT:    and v19.16b, v19.16b, v3.16b
+; CHECK-NEXT:    scvtf s16, x10
+; CHECK-NEXT:    fmov x10, d24
+; CHECK-NEXT:    scvtf s25, x11
+; CHECK-NEXT:    scvtf s20, x9
+; CHECK-NEXT:    mov x9, v24.d[1]
+; CHECK-NEXT:    mov v17.s[1], v21.s[0]
+; CHECK-NEXT:    fmov x11, d23
+; CHECK-NEXT:    mov v18.s[1], v28.s[0]
+; CHECK-NEXT:    scvtf s24, x8
+; CHECK-NEXT:    scvtf s21, x10
+; CHECK-NEXT:    mov x10, v23.d[1]
+; CHECK-NEXT:    mov v5.s[2], v22.s[0]
+; CHECK-NEXT:    ushr v22.4s, v1.4s, #16
+; CHECK-NEXT:    ushr v28.4s, v0.4s, #16
+; CHECK-NEXT:    scvtf s23, x11
+; CHECK-NEXT:    mov v20.s[1], v25.s[0]
+; CHECK-NEXT:    scvtf s25, x9
+; CHECK-NEXT:    mov v17.s[2], v16.s[0]
+; CHECK-NEXT:    add v16.4s, v19.4s, v26.4s
+; CHECK-NEXT:    ushr v26.4s, v4.4s, #16
+; CHECK-NEXT:    mov v18.s[2], v21.s[0]
+; CHECK-NEXT:    scvtf s7, x10
+; CHECK-NEXT:    and v22.16b, v22.16b, v3.16b
+; CHECK-NEXT:    mov v5.s[3], v27.s[0]
+; CHECK-NEXT:    and v21.16b, v28.16b, v3.16b
+; CHECK-NEXT:    fcmeq v19.4s, v2.4s, v2.4s
+; CHECK-NEXT:    mov v20.s[2], v23.s[0]
+; CHECK-NEXT:    add v23.4s, v0.4s, v6.4s
+; CHECK-NEXT:    orr v2.4s, #64, lsl #16
+; CHECK-NEXT:    mov v17.s[3], v24.s[0]
+; CHECK-NEXT:    add v24.4s, v1.4s, v6.4s
+; CHECK-NEXT:    fcmeq v27.4s, v1.4s, v1.4s
+; CHECK-NEXT:    mov v18.s[3], v25.s[0]
+; CHECK-NEXT:    add v25.4s, v4.4s, v6.4s
 ; CHECK-NEXT:    orr v1.4s, #64, lsl #16
-; CHECK-NEXT:    and v23.16b, v25.16b, v16.16b
-; CHECK-NEXT:    add v25.4s, v4.4s, v2.4s
-; CHECK-NEXT:    add v7.4s, v24.4s, v7.4s
-; CHECK-NEXT:    ushr v24.4s, v18.4s, #16
-; CHECK-NEXT:    add v30.4s, v18.4s, v2.4s
-; CHECK-NEXT:    bit v0.16b, v5.16b, v20.16b
-; CHECK-NEXT:    ushr v28.4s, v22.4s, #16
-; CHECK-NEXT:    add v31.4s, v22.4s, v2.4s
+; CHECK-NEXT:    bit v2.16b, v16.16b, v19.16b
+; CHECK-NEXT:    mov v20.s[3], v7.s[0]
+; CHECK-NEXT:    add v22.4s, v22.4s, v24.4s
+; CHECK-NEXT:    add v7.4s, v21.4s, v23.4s
+; CHECK-NEXT:    ushr v24.4s, v17.4s, #16
+; CHECK-NEXT:    and v23.16b, v26.16b, v3.16b
+; CHECK-NEXT:    ushr v26.4s, v5.4s, #16
+; CHECK-NEXT:    ushr v28.4s, v18.4s, #16
+; CHECK-NEXT:    add v30.4s, v17.4s, v6.4s
+; CHECK-NEXT:    add v31.4s, v18.4s, v6.4s
+; CHECK-NEXT:    fcmeq v21.4s, v0.4s, v0.4s
+; CHECK-NEXT:    orr v0.4s, #64, lsl #16
+; CHECK-NEXT:    bit v1.16b, v22.16b, v27.16b
+; CHECK-NEXT:    ushr v29.4s, v20.4s, #16
+; CHECK-NEXT:    and v24.16b, v24.16b, v3.16b
 ; CHECK-NEXT:    add v23.4s, v23.4s, v25.4s
-; CHECK-NEXT:    and v25.16b, v26.16b, v16.16b
-; CHECK-NEXT:    add v26.4s, v6.4s, v2.4s
-; CHECK-NEXT:    ushr v29.4s, v17.4s, #16
-; CHECK-NEXT:    and v24.16b, v24.16b, v16.16b
-; CHECK-NEXT:    add v2.4s, v17.4s, v2.4s
-; CHECK-NEXT:    and v28.16b, v28.16b, v16.16b
-; CHECK-NEXT:    bit v3.16b, v7.16b, v27.16b
-; CHECK-NEXT:    bit v1.16b, v19.16b, v21.16b
-; CHECK-NEXT:    add v25.4s, v25.4s, v26.4s
-; CHECK-NEXT:    fcmeq v26.4s, v6.4s, v6.4s
-; CHECK-NEXT:    orr v6.4s, #64, lsl #16
-; CHECK-NEXT:    and v16.16b, v29.16b, v16.16b
+; CHECK-NEXT:    and v28.16b, v28.16b, v3.16b
+; CHECK-NEXT:    and v25.16b, v26.16b, v3.16b
+; CHECK-NEXT:    add v26.4s, v5.4s, v6.4s
+; CHECK-NEXT:    add v6.4s, v20.4s, v6.4s
+; CHECK-NEXT:    and v3.16b, v29.16b, v3.16b
 ; CHECK-NEXT:    add v24.4s, v24.4s, v30.4s
-; CHECK-NEXT:    fcmeq v30.4s, v18.4s, v18.4s
+; CHECK-NEXT:    fcmeq v30.4s, v17.4s, v17.4s
 ; CHECK-NEXT:    add v28.4s, v28.4s, v31.4s
-; CHECK-NEXT:    fcmeq v31.4s, v22.4s, v22.4s
+; CHECK-NEXT:    fcmeq v31.4s, v18.4s, v18.4s
 ; CHECK-NEXT:    fcmeq v29.4s, v4.4s, v4.4s
+; CHECK-NEXT:    add v25.4s, v25.4s, v26.4s
+; CHECK-NEXT:    fcmeq v26.4s, v5.4s, v5.4s
 ; CHECK-NEXT:    orr v4.4s, #64, lsl #16
-; CHECK-NEXT:    orr v18.4s, #64, lsl #16
-; CHECK-NEXT:    orr v22.4s, #64, lsl #16
-; CHECK-NEXT:    mov v5.16b, v26.16b
-; CHECK-NEXT:    add v2.4s, v16.4s, v2.4s
-; CHECK-NEXT:    fcmeq v16.4s, v17.4s, v17.4s
+; CHECK-NEXT:    add v3.4s, v3.4s, v6.4s
+; CHECK-NEXT:    fcmeq v6.4s, v20.4s, v20.4s
+; CHECK-NEXT:    orr v5.4s, #64, lsl #16
 ; CHECK-NEXT:    orr v17.4s, #64, lsl #16
-; CHECK-NEXT:    uzp2 v0.8h, v1.8h, v0.8h
-; CHECK-NEXT:    mov v7.16b, v31.16b
+; CHECK-NEXT:    orr v18.4s, #64, lsl #16
+; CHECK-NEXT:    orr v20.4s, #64, lsl #16
+; CHECK-NEXT:    bit v0.16b, v7.16b, v21.16b
+; CHECK-NEXT:    mov v7.16b, v30.16b
+; CHECK-NEXT:    mov v16.16b, v31.16b
 ; CHECK-NEXT:    bit v4.16b, v23.16b, v29.16b
-; CHECK-NEXT:    bsl v5.16b, v25.16b, v6.16b
-; CHECK-NEXT:    mov v6.16b, v30.16b
-; CHECK-NEXT:    bsl v16.16b, v2.16b, v17.16b
-; CHECK-NEXT:    bsl v7.16b, v28.16b, v22.16b
-; CHECK-NEXT:    bsl v6.16b, v24.16b, v18.16b
-; CHECK-NEXT:    uzp2 v1.8h, v4.8h, v3.8h
-; CHECK-NEXT:    uzp2 v3.8h, v16.8h, v7.8h
-; CHECK-NEXT:    uzp2 v2.8h, v6.8h, v5.8h
+; CHECK-NEXT:    bit v5.16b, v25.16b, v26.16b
+; CHECK-NEXT:    bif v3.16b, v20.16b, v6.16b
+; CHECK-NEXT:    bsl v7.16b, v24.16b, v17.16b
+; CHECK-NEXT:    bsl v16.16b, v28.16b, v18.16b
+; CHECK-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    uzp2 v1.8h, v4.8h, v1.8h
+; CHECK-NEXT:    uzp2 v2.8h, v7.8h, v5.8h
+; CHECK-NEXT:    uzp2 v3.8h, v3.8h, v16.8h
 ; CHECK-NEXT:    ret
 entry:
   %c = sitofp <32 x i64> %a to <32 x bfloat>
@@ -742,107 +905,162 @@ entry:
 define <32 x bfloat> @utofp_v32i64_v32bf16(<32 x i64> %a) {
 ; CHECK-LABEL: utofp_v32i64_v32bf16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ucvtf v17.2d, v2.2d
-; CHECK-NEXT:    ucvtf v18.2d, v0.2d
-; CHECK-NEXT:    ucvtf v19.2d, v3.2d
-; CHECK-NEXT:    ucvtf v3.2d, v6.2d
-; CHECK-NEXT:    ldp q21, q20, [sp, #32]
-; CHECK-NEXT:    ucvtf v4.2d, v4.2d
-; CHECK-NEXT:    ucvtf v6.2d, v7.2d
-; CHECK-NEXT:    ucvtf v5.2d, v5.2d
-; CHECK-NEXT:    ldp q24, q23, [sp, #64]
-; CHECK-NEXT:    movi v16.4s, #1
-; CHECK-NEXT:    fcvtn v0.2s, v17.2d
-; CHECK-NEXT:    ucvtf v17.2d, v1.2d
-; CHECK-NEXT:    fcvtn v1.2s, v18.2d
-; CHECK-NEXT:    fcvtn v3.2s, v3.2d
-; CHECK-NEXT:    ldp q18, q7, [sp]
-; CHECK-NEXT:    ucvtf v21.2d, v21.2d
-; CHECK-NEXT:    fcvtn v4.2s, v4.2d
-; CHECK-NEXT:    movi v2.4s, #127, msl #8
-; CHECK-NEXT:    ucvtf v20.2d, v20.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v19.2d
-; CHECK-NEXT:    ldp q22, q19, [sp, #96]
-; CHECK-NEXT:    fcvtn2 v1.4s, v17.2d
-; CHECK-NEXT:    fcvtn2 v3.4s, v6.2d
-; CHECK-NEXT:    ucvtf v18.2d, v18.2d
-; CHECK-NEXT:    ucvtf v17.2d, v24.2d
-; CHECK-NEXT:    fcvtn v6.2s, v21.2d
-; CHECK-NEXT:    fcvtn2 v4.4s, v5.2d
-; CHECK-NEXT:    ucvtf v22.2d, v22.2d
-; CHECK-NEXT:    ucvtf v21.2d, v23.2d
-; CHECK-NEXT:    ucvtf v7.2d, v7.2d
-; CHECK-NEXT:    ushr v24.4s, v0.4s, #16
-; CHECK-NEXT:    add v5.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ucvtf v19.2d, v19.2d
-; CHECK-NEXT:    ushr v23.4s, v1.4s, #16
-; CHECK-NEXT:    ushr v25.4s, v3.4s, #16
-; CHECK-NEXT:    fcvtn v18.2s, v18.2d
-; CHECK-NEXT:    fcvtn2 v6.4s, v20.2d
-; CHECK-NEXT:    add v26.4s, v1.4s, v2.4s
-; CHECK-NEXT:    fcvtn v17.2s, v17.2d
-; CHECK-NEXT:    and v24.16b, v24.16b, v16.16b
-; CHECK-NEXT:    fcvtn v22.2s, v22.2d
-; CHECK-NEXT:    fcmeq v20.4s, v0.4s, v0.4s
-; CHECK-NEXT:    and v23.16b, v23.16b, v16.16b
-; CHECK-NEXT:    orr v0.4s, #64, lsl #16
-; CHECK-NEXT:    fcmeq v27.4s, v3.4s, v3.4s
-; CHECK-NEXT:    fcvtn2 v18.4s, v7.2d
-; CHECK-NEXT:    add v7.4s, v3.4s, v2.4s
-; CHECK-NEXT:    orr v3.4s, #64, lsl #16
-; CHECK-NEXT:    add v5.4s, v24.4s, v5.4s
-; CHECK-NEXT:    and v24.16b, v25.16b, v16.16b
-; CHECK-NEXT:    ushr v25.4s, v4.4s, #16
-; CHECK-NEXT:    fcvtn2 v22.4s, v19.2d
-; CHECK-NEXT:    add v19.4s, v23.4s, v26.4s
-; CHECK-NEXT:    ushr v26.4s, v6.4s, #16
-; CHECK-NEXT:    fcvtn2 v17.4s, v21.2d
-; CHECK-NEXT:    fcmeq v21.4s, v1.4s, v1.4s
+; CHECK-NEXT:    fmov x10, d2
+; CHECK-NEXT:    mov x9, v3.d[1]
+; CHECK-NEXT:    mov x8, v2.d[1]
+; CHECK-NEXT:    fmov x11, d3
+; CHECK-NEXT:    fmov x12, d0
+; CHECK-NEXT:    movi v3.4s, #1
+; CHECK-NEXT:    ucvtf s2, x10
+; CHECK-NEXT:    mov x10, v0.d[1]
+; CHECK-NEXT:    ucvtf s19, x9
+; CHECK-NEXT:    fmov x9, d1
+; CHECK-NEXT:    ucvtf s16, x11
+; CHECK-NEXT:    mov x11, v6.d[1]
+; CHECK-NEXT:    ucvtf s0, x12
+; CHECK-NEXT:    ucvtf s18, x8
+; CHECK-NEXT:    mov x8, v1.d[1]
+; CHECK-NEXT:    ucvtf s20, x10
+; CHECK-NEXT:    ucvtf s17, x9
+; CHECK-NEXT:    mov x9, v7.d[1]
+; CHECK-NEXT:    mov x10, v4.d[1]
+; CHECK-NEXT:    ucvtf s21, x11
+; CHECK-NEXT:    fmov x11, d6
+; CHECK-NEXT:    mov v2.s[1], v18.s[0]
+; CHECK-NEXT:    ucvtf s25, x8
+; CHECK-NEXT:    movi v6.4s, #127, msl #8
+; CHECK-NEXT:    mov v0.s[1], v20.s[0]
+; CHECK-NEXT:    ldp q24, q20, [sp, #32]
+; CHECK-NEXT:    ucvtf s22, x9
+; CHECK-NEXT:    fmov x9, d4
+; CHECK-NEXT:    ucvtf s1, x11
+; CHECK-NEXT:    ucvtf s26, x10
+; CHECK-NEXT:    fmov x11, d7
+; CHECK-NEXT:    mov v2.s[2], v16.s[0]
+; CHECK-NEXT:    ldp q18, q16, [sp]
+; CHECK-NEXT:    mov x8, v24.d[1]
+; CHECK-NEXT:    ucvtf s4, x9
+; CHECK-NEXT:    fmov x9, d5
+; CHECK-NEXT:    mov v0.s[2], v17.s[0]
+; CHECK-NEXT:    mov v1.s[1], v21.s[0]
+; CHECK-NEXT:    ucvtf s23, x11
+; CHECK-NEXT:    mov x11, v5.d[1]
+; CHECK-NEXT:    mov v2.s[3], v19.s[0]
+; CHECK-NEXT:    ucvtf s21, x8
+; CHECK-NEXT:    mov x8, v20.d[1]
+; CHECK-NEXT:    ucvtf s17, x9
+; CHECK-NEXT:    fmov x9, d24
+; CHECK-NEXT:    mov v4.s[1], v26.s[0]
+; CHECK-NEXT:    mov v0.s[3], v25.s[0]
+; CHECK-NEXT:    ldp q26, q24, [sp, #96]
+; CHECK-NEXT:    mov v1.s[2], v23.s[0]
+; CHECK-NEXT:    ldp q25, q23, [sp, #64]
+; CHECK-NEXT:    ucvtf s7, x11
+; CHECK-NEXT:    ucvtf s27, x8
+; CHECK-NEXT:    fmov x8, d18
+; CHECK-NEXT:    ucvtf s5, x9
+; CHECK-NEXT:    mov x10, v26.d[1]
+; CHECK-NEXT:    mov x9, v18.d[1]
+; CHECK-NEXT:    fmov x11, d20
+; CHECK-NEXT:    mov v4.s[2], v17.s[0]
+; CHECK-NEXT:    mov v1.s[3], v22.s[0]
+; CHECK-NEXT:    ushr v19.4s, v2.4s, #16
+; CHECK-NEXT:    ucvtf s17, x8
+; CHECK-NEXT:    fmov x8, d26
+; CHECK-NEXT:    add v26.4s, v2.4s, v6.4s
+; CHECK-NEXT:    ucvtf s22, x11
+; CHECK-NEXT:    mov x11, v25.d[1]
+; CHECK-NEXT:    mov v5.s[1], v21.s[0]
+; CHECK-NEXT:    ucvtf s28, x10
+; CHECK-NEXT:    fmov x10, d16
+; CHECK-NEXT:    ucvtf s21, x9
+; CHECK-NEXT:    fmov x9, d25
+; CHECK-NEXT:    ucvtf s18, x8
+; CHECK-NEXT:    mov x8, v16.d[1]
+; CHECK-NEXT:    mov v4.s[3], v7.s[0]
+; CHECK-NEXT:    and v19.16b, v19.16b, v3.16b
+; CHECK-NEXT:    ucvtf s16, x10
+; CHECK-NEXT:    fmov x10, d24
+; CHECK-NEXT:    ucvtf s25, x11
+; CHECK-NEXT:    ucvtf s20, x9
+; CHECK-NEXT:    mov x9, v24.d[1]
+; CHECK-NEXT:    mov v17.s[1], v21.s[0]
+; CHECK-NEXT:    fmov x11, d23
+; CHECK-NEXT:    mov v18.s[1], v28.s[0]
+; CHECK-NEXT:    ucvtf s24, x8
+; CHECK-NEXT:    ucvtf s21, x10
+; CHECK-NEXT:    mov x10, v23.d[1]
+; CHECK-NEXT:    mov v5.s[2], v22.s[0]
+; CHECK-NEXT:    ushr v22.4s, v1.4s, #16
+; CHECK-NEXT:    ushr v28.4s, v0.4s, #16
+; CHECK-NEXT:    ucvtf s23, x11
+; CHECK-NEXT:    mov v20.s[1], v25.s[0]
+; CHECK-NEXT:    ucvtf s25, x9
+; CHECK-NEXT:    mov v17.s[2], v16.s[0]
+; CHECK-NEXT:    add v16.4s, v19.4s, v26.4s
+; CHECK-NEXT:    ushr v26.4s, v4.4s, #16
+; CHECK-NEXT:    mov v18.s[2], v21.s[0]
+; CHECK-NEXT:    ucvtf s7, x10
+; CHECK-NEXT:    and v22.16b, v22.16b, v3.16b
+; CHECK-NEXT:    mov v5.s[3], v27.s[0]
+; CHECK-NEXT:    and v21.16b, v28.16b, v3.16b
+; CHECK-NEXT:    fcmeq v19.4s, v2.4s, v2.4s
+; CHECK-NEXT:    mov v20.s[2], v23.s[0]
+; CHECK-NEXT:    add v23.4s, v0.4s, v6.4s
+; CHECK-NEXT:    orr v2.4s, #64, lsl #16
+; CHECK-NEXT:    mov v17.s[3], v24.s[0]
+; CHECK-NEXT:    add v24.4s, v1.4s, v6.4s
+; CHECK-NEXT:    fcmeq v27.4s, v1.4s, v1.4s
+; CHECK-NEXT:    mov v18.s[3], v25.s[0]
+; CHECK-NEXT:    add v25.4s, v4.4s, v6.4s
 ; CHECK-NEXT:    orr v1.4s, #64, lsl #16
-; CHECK-NEXT:    and v23.16b, v25.16b, v16.16b
-; CHECK-NEXT:    add v25.4s, v4.4s, v2.4s
-; CHECK-NEXT:    add v7.4s, v24.4s, v7.4s
-; CHECK-NEXT:    ushr v24.4s, v18.4s, #16
-; CHECK-NEXT:    add v30.4s, v18.4s, v2.4s
-; CHECK-NEXT:    bit v0.16b, v5.16b, v20.16b
-; CHECK-NEXT:    ushr v28.4s, v22.4s, #16
-; CHECK-NEXT:    add v31.4s, v22.4s, v2.4s
+; CHECK-NEXT:    bit v2.16b, v16.16b, v19.16b
+; CHECK-NEXT:    mov v20.s[3], v7.s[0]
+; CHECK-NEXT:    add v22.4s, v22.4s, v24.4s
+; CHECK-NEXT:    add v7.4s, v21.4s, v23.4s
+; CHECK-NEXT:    ushr v24.4s, v17.4s, #16
+; CHECK-NEXT:    and v23.16b, v26.16b, v3.16b
+; CHECK-NEXT:    ushr v26.4s, v5.4s, #16
+; CHECK-NEXT:    ushr v28.4s, v18.4s, #16
+; CHECK-NEXT:    add v30.4s, v17.4s, v6.4s
+; CHECK-NEXT:    add v31.4s, v18.4s, v6.4s
+; CHECK-NEXT:    fcmeq v21.4s, v0.4s, v0.4s
+; CHECK-NEXT:    orr v0.4s, #64, lsl #16
+; CHECK-NEXT:    bit v1.16b, v22.16b, v27.16b
+; CHECK-NEXT:    ushr v29.4s, v20.4s, #16
+; CHECK-NEXT:    and v24.16b, v24.16b, v3.16b
 ; CHECK-NEXT:    add v23.4s, v23.4s, v25.4s
-; CHECK-NEXT:    and v25.16b, v26.16b, v16.16b
-; CHECK-NEXT:    add v26.4s, v6.4s, v2.4s
-; CHECK-NEXT:    ushr v29.4s, v17.4s, #16
-; CHECK-NEXT:    and v24.16b, v24.16b, v16.16b
-; CHECK-NEXT:    add v2.4s, v17.4s, v2.4s
-; CHECK-NEXT:    and v28.16b, v28.16b, v16.16b
-; CHECK-NEXT:    bit v3.16b, v7.16b, v27.16b
-; CHECK-NEXT:    bit v1.16b, v19.16b, v21.16b
-; CHECK-NEXT:    add v25.4s, v25.4s, v26.4s
-; CHECK-NEXT:    fcmeq v26.4s, v6.4s, v6.4s
-; CHECK-NEXT:    orr v6.4s, #64, lsl #16
-; CHECK-NEXT:    and v16.16b, v29.16b, v16.16b
+; CHECK-NEXT:    and v28.16b, v28.16b, v3.16b
+; CHECK-NEXT:    and v25.16b, v26.16b, v3.16b
+; CHECK-NEXT:    add v26.4s, v5.4s, v6.4s
+; CHECK-NEXT:    add v6.4s, v20.4s, v6.4s
+; CHECK-NEXT:    and v3.16b, v29.16b, v3.16b
 ; CHECK-NEXT:    add v24.4s, v24.4s, v30.4s
-; CHECK-NEXT:    fcmeq v30.4s, v18.4s, v18.4s
+; CHECK-NEXT:    fcmeq v30.4s, v17.4s, v17.4s
 ; CHECK-NEXT:    add v28.4s, v28.4s, v31.4s
-; CHECK-NEXT:    fcmeq v31.4s, v22.4s, v22.4s
+; CHECK-NEXT:    fcmeq v31.4s, v18.4s, v18.4s
 ; CHECK-NEXT:    fcmeq v29.4s, v4.4s, v4.4s
+; CHECK-NEXT:    add v25.4s, v25.4s, v26.4s
+; CHECK-NEXT:    fcmeq v26.4s, v5.4s, v5.4s
 ; CHECK-NEXT:    orr v4.4s, #64, lsl #16
-; CHECK-NEXT:    orr v18.4s, #64, lsl #16
-; CHECK-NEXT:    orr v22.4s, #64, lsl #16
-; CHECK-NEXT:    mov v5.16b, v26.16b
-; CHECK-NEXT:    add v2.4s, v16.4s, v2.4s
-; CHECK-NEXT:    fcmeq v16.4s, v17.4s, v17.4s
+; CHECK-NEXT:    add v3.4s, v3.4s, v6.4s
+; CHECK-NEXT:    fcmeq v6.4s, v20.4s, v20.4s
+; CHECK-NEXT:    orr v5.4s, #64, lsl #16
 ; CHECK-NEXT:    orr v17.4s, #64, lsl #16
-; CHECK-NEXT:    uzp2 v0.8h, v1.8h, v0.8h
-; CHECK-NEXT:    mov v7.16b, v31.16b
+; CHECK-NEXT:    orr v18.4s, #64, lsl #16
+; CHECK-NEXT:    orr v20.4s, #64, lsl #16
+; CHECK-NEXT:    bit v0.16b, v7.16b, v21.16b
+; CHECK-NEXT:    mov v7.16b, v30.16b
+; CHECK-NEXT:    mov v16.16b, v31.16b
 ; CHECK-NEXT:    bit v4.16b, v23.16b, v29.16b
-; CHECK-NEXT:    bsl v5.16b, v25.16b, v6.16b
-; CHECK-NEXT:    mov v6.16b, v30.16b
-; CHECK-NEXT:    bsl v16.16b, v2.16b, v17.16b
-; CHECK-NEXT:    bsl v7.16b, v28.16b, v22.16b
-; CHECK-NEXT:    bsl v6.16b, v24.16b, v18.16b
-; CHECK-NEXT:    uzp2 v1.8h, v4.8h, v3.8h
-; CHECK-NEXT:    uzp2 v3.8h, v16.8h, v7.8h
-; CHECK-NEXT:    uzp2 v2.8h, v6.8h, v5.8h
+; CHECK-NEXT:    bit v5.16b, v25.16b, v26.16b
+; CHECK-NEXT:    bif v3.16b, v20.16b, v6.16b
+; CHECK-NEXT:    bsl v7.16b, v24.16b, v17.16b
+; CHECK-NEXT:    bsl v16.16b, v28.16b, v18.16b
+; CHECK-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    uzp2 v1.8h, v4.8h, v1.8h
+; CHECK-NEXT:    uzp2 v2.8h, v7.8h, v5.8h
+; CHECK-NEXT:    uzp2 v3.8h, v3.8h, v16.8h
 ; CHECK-NEXT:    ret
 entry:
   %c = uitofp <32 x i64> %a to <32 x bfloat>
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index 81c1a64f2d434..07957c117868d 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -4421,22 +4421,42 @@ entry:
 }
 
 define <2 x float> @stofp_v2i64_v2f32(<2 x i64> %a) {
-; CHECK-LABEL: stofp_v2i64_v2f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_v2i64_v2f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, v0.d[1]
+; CHECK-SD-NEXT:    fmov x9, d0
+; CHECK-SD-NEXT:    scvtf s0, x9
+; CHECK-SD-NEXT:    scvtf s1, x8
+; CHECK-SD-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_v2i64_v2f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-GI-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <2 x i64> %a to <2 x float>
   ret <2 x float> %c
 }
 
 define <2 x float> @utofp_v2i64_v2f32(<2 x i64> %a) {
-; CHECK-LABEL: utofp_v2i64_v2f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_v2i64_v2f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, v0.d[1]
+; CHECK-SD-NEXT:    fmov x9, d0
+; CHECK-SD-NEXT:    ucvtf s0, x9
+; CHECK-SD-NEXT:    ucvtf s1, x8
+; CHECK-SD-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_v2i64_v2f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-GI-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <2 x i64> %a to <2 x float>
   ret <2 x float> %c
@@ -4446,13 +4466,18 @@ define <3 x float> @stofp_v3i64_v3f32(<3 x i64> %a) {
 ; CHECK-SD-LABEL: stofp_v3i64_v3f32:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    fmov x8, d0
 ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-SD-NEXT:    scvtf v1.2d, v2.2d
-; CHECK-SD-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-SD-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT:    scvtf s3, x8
+; CHECK-SD-NEXT:    fmov x8, d1
+; CHECK-SD-NEXT:    scvtf s1, x8
+; CHECK-SD-NEXT:    fmov x8, d2
+; CHECK-SD-NEXT:    mov v0.s[0], v3.s[0]
+; CHECK-SD-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-SD-NEXT:    scvtf s1, x8
+; CHECK-SD-NEXT:    mov v0.s[2], v1.s[0]
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: stofp_v3i64_v3f32:
@@ -4478,13 +4503,18 @@ define <3 x float> @utofp_v3i64_v3f32(<3 x i64> %a) {
 ; CHECK-SD-LABEL: utofp_v3i64_v3f32:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    fmov x8, d0
 ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-SD-NEXT:    ucvtf v1.2d, v2.2d
-; CHECK-SD-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-SD-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT:    ucvtf s3, x8
+; CHECK-SD-NEXT:    fmov x8, d1
+; CHECK-SD-NEXT:    ucvtf s1, x8
+; CHECK-SD-NEXT:    fmov x8, d2
+; CHECK-SD-NEXT:    mov v0.s[0], v3.s[0]
+; CHECK-SD-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-SD-NEXT:    ucvtf s1, x8
+; CHECK-SD-NEXT:    mov v0.s[2], v1.s[0]
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: utofp_v3i64_v3f32:
@@ -4507,26 +4537,56 @@ entry:
 }
 
 define <4 x float> @stofp_v4i64_v4f32(<4 x i64> %a) {
-; CHECK-LABEL: stofp_v4i64_v4f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    scvtf v1.2d, v1.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_v4i64_v4f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, v0.d[1]
+; CHECK-SD-NEXT:    fmov x9, d0
+; CHECK-SD-NEXT:    scvtf s0, x9
+; CHECK-SD-NEXT:    mov x9, v1.d[1]
+; CHECK-SD-NEXT:    scvtf s2, x8
+; CHECK-SD-NEXT:    fmov x8, d1
+; CHECK-SD-NEXT:    scvtf s1, x8
+; CHECK-SD-NEXT:    mov v0.s[1], v2.s[0]
+; CHECK-SD-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-SD-NEXT:    scvtf s1, x9
+; CHECK-SD-NEXT:    mov v0.s[3], v1.s[0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_v4i64_v4f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-GI-NEXT:    scvtf v1.2d, v1.2d
+; CHECK-GI-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <4 x i64> %a to <4 x float>
   ret <4 x float> %c
 }
 
 define <4 x float> @utofp_v4i64_v4f32(<4 x i64> %a) {
-; CHECK-LABEL: utofp_v4i64_v4f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    ucvtf v1.2d, v1.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_v4i64_v4f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x8, v0.d[1]
+; CHECK-SD-NEXT:    fmov x9, d0
+; CHECK-SD-NEXT:    ucvtf s0, x9
+; CHECK-SD-NEXT:    mov x9, v1.d[1]
+; CHECK-SD-NEXT:    ucvtf s2, x8
+; CHECK-SD-NEXT:    fmov x8, d1
+; CHECK-SD-NEXT:    ucvtf s1, x8
+; CHECK-SD-NEXT:    mov v0.s[1], v2.s[0]
+; CHECK-SD-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-SD-NEXT:    ucvtf s1, x9
+; CHECK-SD-NEXT:    mov v0.s[3], v1.s[0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_v4i64_v4f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-GI-NEXT:    ucvtf v1.2d, v1.2d
+; CHECK-GI-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <4 x i64> %a to <4 x float>
   ret <4 x float> %c
@@ -4535,14 +4595,29 @@ entry:
 define <8 x float> @stofp_v8i64_v8f32(<8 x i64> %a) {
 ; CHECK-SD-LABEL: stofp_v8i64_v8f32:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-SD-NEXT:    scvtf v2.2d, v2.2d
-; CHECK-SD-NEXT:    scvtf v4.2d, v1.2d
-; CHECK-SD-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    fcvtn v1.2s, v2.2d
-; CHECK-SD-NEXT:    scvtf v2.2d, v3.2d
-; CHECK-SD-NEXT:    fcvtn2 v0.4s, v4.2d
-; CHECK-SD-NEXT:    fcvtn2 v1.4s, v2.2d
+; CHECK-SD-NEXT:    mov x8, v0.d[1]
+; CHECK-SD-NEXT:    mov x9, v2.d[1]
+; CHECK-SD-NEXT:    fmov x10, d0
+; CHECK-SD-NEXT:    fmov x11, d2
+; CHECK-SD-NEXT:    scvtf s0, x10
+; CHECK-SD-NEXT:    mov x10, v3.d[1]
+; CHECK-SD-NEXT:    scvtf s4, x8
+; CHECK-SD-NEXT:    scvtf s5, x9
+; CHECK-SD-NEXT:    scvtf s2, x11
+; CHECK-SD-NEXT:    fmov x9, d1
+; CHECK-SD-NEXT:    fmov x11, d3
+; CHECK-SD-NEXT:    mov x8, v1.d[1]
+; CHECK-SD-NEXT:    scvtf s1, x9
+; CHECK-SD-NEXT:    mov v0.s[1], v4.s[0]
+; CHECK-SD-NEXT:    scvtf s3, x11
+; CHECK-SD-NEXT:    mov v2.s[1], v5.s[0]
+; CHECK-SD-NEXT:    scvtf s4, x8
+; CHECK-SD-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-SD-NEXT:    scvtf s1, x10
+; CHECK-SD-NEXT:    mov v2.s[2], v3.s[0]
+; CHECK-SD-NEXT:    mov v0.s[3], v4.s[0]
+; CHECK-SD-NEXT:    mov v2.s[3], v1.s[0]
+; CHECK-SD-NEXT:    mov v1.16b, v2.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: stofp_v8i64_v8f32:
@@ -4564,14 +4639,29 @@ entry:
 define <8 x float> @utofp_v8i64_v8f32(<8 x i64> %a) {
 ; CHECK-SD-LABEL: utofp_v8i64_v8f32:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-SD-NEXT:    ucvtf v2.2d, v2.2d
-; CHECK-SD-NEXT:    ucvtf v4.2d, v1.2d
-; CHECK-SD-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    fcvtn v1.2s, v2.2d
-; CHECK-SD-NEXT:    ucvtf v2.2d, v3.2d
-; CHECK-SD-NEXT:    fcvtn2 v0.4s, v4.2d
-; CHECK-SD-NEXT:    fcvtn2 v1.4s, v2.2d
+; CHECK-SD-NEXT:    mov x8, v0.d[1]
+; CHECK-SD-NEXT:    mov x9, v2.d[1]
+; CHECK-SD-NEXT:    fmov x10, d0
+; CHECK-SD-NEXT:    fmov x11, d2
+; CHECK-SD-NEXT:    ucvtf s0, x10
+; CHECK-SD-NEXT:    mov x10, v3.d[1]
+; CHECK-SD-NEXT:    ucvtf s4, x8
+; CHECK-SD-NEXT:    ucvtf s5, x9
+; CHECK-SD-NEXT:    ucvtf s2, x11
+; CHECK-SD-NEXT:    fmov x9, d1
+; CHECK-SD-NEXT:    fmov x11, d3
+; CHECK-SD-NEXT:    mov x8, v1.d[1]
+; CHECK-SD-NEXT:    ucvtf s1, x9
+; CHECK-SD-NEXT:    mov v0.s[1], v4.s[0]
+; CHECK-SD-NEXT:    ucvtf s3, x11
+; CHECK-SD-NEXT:    mov v2.s[1], v5.s[0]
+; CHECK-SD-NEXT:    ucvtf s4, x8
+; CHECK-SD-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-SD-NEXT:    ucvtf s1, x10
+; CHECK-SD-NEXT:    mov v2.s[2], v3.s[0]
+; CHECK-SD-NEXT:    mov v0.s[3], v4.s[0]
+; CHECK-SD-NEXT:    mov v2.s[3], v1.s[0]
+; CHECK-SD-NEXT:    mov v1.16b, v2.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: utofp_v8i64_v8f32:
@@ -4591,50 +4681,148 @@ entry:
 }
 
 define <16 x float> @stofp_v16i64_v16f32(<16 x i64> %a) {
-; CHECK-LABEL: stofp_v16i64_v16f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    scvtf v2.2d, v2.2d
-; CHECK-NEXT:    scvtf v4.2d, v4.2d
-; CHECK-NEXT:    scvtf v6.2d, v6.2d
-; CHECK-NEXT:    scvtf v16.2d, v1.2d
-; CHECK-NEXT:    scvtf v17.2d, v3.2d
-; CHECK-NEXT:    scvtf v5.2d, v5.2d
-; CHECK-NEXT:    scvtf v7.2d, v7.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn v1.2s, v2.2d
-; CHECK-NEXT:    fcvtn v2.2s, v4.2d
-; CHECK-NEXT:    fcvtn v3.2s, v6.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v16.2d
-; CHECK-NEXT:    fcvtn2 v1.4s, v17.2d
-; CHECK-NEXT:    fcvtn2 v2.4s, v5.2d
-; CHECK-NEXT:    fcvtn2 v3.4s, v7.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: stofp_v16i64_v16f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov x13, d2
+; CHECK-SD-NEXT:    mov x9, v0.d[1]
+; CHECK-SD-NEXT:    mov x10, v2.d[1]
+; CHECK-SD-NEXT:    fmov x11, d0
+; CHECK-SD-NEXT:    mov x12, v4.d[1]
+; CHECK-SD-NEXT:    mov x8, v1.d[1]
+; CHECK-SD-NEXT:    scvtf s16, x13
+; CHECK-SD-NEXT:    fmov x13, d4
+; CHECK-SD-NEXT:    scvtf s0, x11
+; CHECK-SD-NEXT:    mov x11, v6.d[1]
+; CHECK-SD-NEXT:    scvtf s17, x9
+; CHECK-SD-NEXT:    scvtf s18, x10
+; CHECK-SD-NEXT:    fmov x9, d1
+; CHECK-SD-NEXT:    scvtf s1, x12
+; CHECK-SD-NEXT:    fmov x12, d6
+; CHECK-SD-NEXT:    scvtf s2, x13
+; CHECK-SD-NEXT:    fmov x13, d3
+; CHECK-SD-NEXT:    mov x10, v3.d[1]
+; CHECK-SD-NEXT:    scvtf s4, x11
+; CHECK-SD-NEXT:    mov v0.s[1], v17.s[0]
+; CHECK-SD-NEXT:    scvtf s6, x9
+; CHECK-SD-NEXT:    scvtf s3, x12
+; CHECK-SD-NEXT:    mov v16.s[1], v18.s[0]
+; CHECK-SD-NEXT:    mov x9, v5.d[1]
+; CHECK-SD-NEXT:    fmov x11, d5
+; CHECK-SD-NEXT:    scvtf s5, x13
+; CHECK-SD-NEXT:    fmov x13, d7
+; CHECK-SD-NEXT:    mov x12, v7.d[1]
+; CHECK-SD-NEXT:    mov v2.s[1], v1.s[0]
+; CHECK-SD-NEXT:    mov v0.s[2], v6.s[0]
+; CHECK-SD-NEXT:    scvtf s6, x10
+; CHECK-SD-NEXT:    scvtf s7, x11
+; CHECK-SD-NEXT:    scvtf s1, x13
+; CHECK-SD-NEXT:    mov v3.s[1], v4.s[0]
+; CHECK-SD-NEXT:    mov v16.s[2], v5.s[0]
+; CHECK-SD-NEXT:    scvtf s4, x8
+; CHECK-SD-NEXT:    scvtf s5, x9
+; CHECK-SD-NEXT:    mov v2.s[2], v7.s[0]
+; CHECK-SD-NEXT:    mov v3.s[2], v1.s[0]
+; CHECK-SD-NEXT:    scvtf s1, x12
+; CHECK-SD-NEXT:    mov v16.s[3], v6.s[0]
+; CHECK-SD-NEXT:    mov v0.s[3], v4.s[0]
+; CHECK-SD-NEXT:    mov v2.s[3], v5.s[0]
+; CHECK-SD-NEXT:    mov v3.s[3], v1.s[0]
+; CHECK-SD-NEXT:    mov v1.16b, v16.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: stofp_v16i64_v16f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-GI-NEXT:    scvtf v2.2d, v2.2d
+; CHECK-GI-NEXT:    scvtf v4.2d, v4.2d
+; CHECK-GI-NEXT:    scvtf v6.2d, v6.2d
+; CHECK-GI-NEXT:    scvtf v16.2d, v1.2d
+; CHECK-GI-NEXT:    scvtf v17.2d, v3.2d
+; CHECK-GI-NEXT:    scvtf v5.2d, v5.2d
+; CHECK-GI-NEXT:    scvtf v7.2d, v7.2d
+; CHECK-GI-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    fcvtn v1.2s, v2.2d
+; CHECK-GI-NEXT:    fcvtn v2.2s, v4.2d
+; CHECK-GI-NEXT:    fcvtn v3.2s, v6.2d
+; CHECK-GI-NEXT:    fcvtn2 v0.4s, v16.2d
+; CHECK-GI-NEXT:    fcvtn2 v1.4s, v17.2d
+; CHECK-GI-NEXT:    fcvtn2 v2.4s, v5.2d
+; CHECK-GI-NEXT:    fcvtn2 v3.4s, v7.2d
+; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <16 x i64> %a to <16 x float>
   ret <16 x float> %c
 }
 
 define <16 x float> @utofp_v16i64_v16f32(<16 x i64> %a) {
-; CHECK-LABEL: utofp_v16i64_v16f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    ucvtf v2.2d, v2.2d
-; CHECK-NEXT:    ucvtf v4.2d, v4.2d
-; CHECK-NEXT:    ucvtf v6.2d, v6.2d
-; CHECK-NEXT:    ucvtf v16.2d, v1.2d
-; CHECK-NEXT:    ucvtf v17.2d, v3.2d
-; CHECK-NEXT:    ucvtf v5.2d, v5.2d
-; CHECK-NEXT:    ucvtf v7.2d, v7.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn v1.2s, v2.2d
-; CHECK-NEXT:    fcvtn v2.2s, v4.2d
-; CHECK-NEXT:    fcvtn v3.2s, v6.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v16.2d
-; CHECK-NEXT:    fcvtn2 v1.4s, v17.2d
-; CHECK-NEXT:    fcvtn2 v2.4s, v5.2d
-; CHECK-NEXT:    fcvtn2 v3.4s, v7.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: utofp_v16i64_v16f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov x13, d2
+; CHECK-SD-NEXT:    mov x9, v0.d[1]
+; CHECK-SD-NEXT:    mov x10, v2.d[1]
+; CHECK-SD-NEXT:    fmov x11, d0
+; CHECK-SD-NEXT:    mov x12, v4.d[1]
+; CHECK-SD-NEXT:    mov x8, v1.d[1]
+; CHECK-SD-NEXT:    ucvtf s16, x13
+; CHECK-SD-NEXT:    fmov x13, d4
+; CHECK-SD-NEXT:    ucvtf s0, x11
+; CHECK-SD-NEXT:    mov x11, v6.d[1]
+; CHECK-SD-NEXT:    ucvtf s17, x9
+; CHECK-SD-NEXT:    ucvtf s18, x10
+; CHECK-SD-NEXT:    fmov x9, d1
+; CHECK-SD-NEXT:    ucvtf s1, x12
+; CHECK-SD-NEXT:    fmov x12, d6
+; CHECK-SD-NEXT:    ucvtf s2, x13
+; CHECK-SD-NEXT:    fmov x13, d3
+; CHECK-SD-NEXT:    mov x10, v3.d[1]
+; CHECK-SD-NEXT:    ucvtf s4, x11
+; CHECK-SD-NEXT:    mov v0.s[1], v17.s[0]
+; CHECK-SD-NEXT:    ucvtf s6, x9
+; CHECK-SD-NEXT:    ucvtf s3, x12
+; CHECK-SD-NEXT:    mov v16.s[1], v18.s[0]
+; CHECK-SD-NEXT:    mov x9, v5.d[1]
+; CHECK-SD-NEXT:    fmov x11, d5
+; CHECK-SD-NEXT:    ucvtf s5, x13
+; CHECK-SD-NEXT:    fmov x13, d7
+; CHECK-SD-NEXT:    mov x12, v7.d[1]
+; CHECK-SD-NEXT:    mov v2.s[1], v1.s[0]
+; CHECK-SD-NEXT:    mov v0.s[2], v6.s[0]
+; CHECK-SD-NEXT:    ucvtf s6, x10
+; CHECK-SD-NEXT:    ucvtf s7, x11
+; CHECK-SD-NEXT:    ucvtf s1, x13
+; CHECK-SD-NEXT:    mov v3.s[1], v4.s[0]
+; CHECK-SD-NEXT:    mov v16.s[2], v5.s[0]
+; CHECK-SD-NEXT:    ucvtf s4, x8
+; CHECK-SD-NEXT:    ucvtf s5, x9
+; CHECK-SD-NEXT:    mov v2.s[2], v7.s[0]
+; CHECK-SD-NEXT:    mov v3.s[2], v1.s[0]
+; CHECK-SD-NEXT:    ucvtf s1, x12
+; CHECK-SD-NEXT:    mov v16.s[3], v6.s[0]
+; CHECK-SD-NEXT:    mov v0.s[3], v4.s[0]
+; CHECK-SD-NEXT:    mov v2.s[3], v5.s[0]
+; CHECK-SD-NEXT:    mov v3.s[3], v1.s[0]
+; CHECK-SD-NEXT:    mov v1.16b, v16.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: utofp_v16i64_v16f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-GI-NEXT:    ucvtf v2.2d, v2.2d
+; CHECK-GI-NEXT:    ucvtf v4.2d, v4.2d
+; CHECK-GI-NEXT:    ucvtf v6.2d, v6.2d
+; CHECK-GI-NEXT:    ucvtf v16.2d, v1.2d
+; CHECK-GI-NEXT:    ucvtf v17.2d, v3.2d
+; CHECK-GI-NEXT:    ucvtf v5.2d, v5.2d
+; CHECK-GI-NEXT:    ucvtf v7.2d, v7.2d
+; CHECK-GI-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    fcvtn v1.2s, v2.2d
+; CHECK-GI-NEXT:    fcvtn v2.2s, v4.2d
+; CHECK-GI-NEXT:    fcvtn v3.2s, v6.2d
+; CHECK-GI-NEXT:    fcvtn2 v0.4s, v16.2d
+; CHECK-GI-NEXT:    fcvtn2 v1.4s, v17.2d
+; CHECK-GI-NEXT:    fcvtn2 v2.4s, v5.2d
+; CHECK-GI-NEXT:    fcvtn2 v3.4s, v7.2d
+; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <16 x i64> %a to <16 x float>
   ret <16 x float> %c
@@ -4643,42 +4831,99 @@ entry:
 define <32 x float> @stofp_v32i64_v32f32(<32 x i64> %a) {
 ; CHECK-SD-LABEL: stofp_v32i64_v32f32:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ldp q17, q16, [sp, #64]
-; CHECK-SD-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-SD-NEXT:    ldp q19, q18, [sp, #32]
-; CHECK-SD-NEXT:    scvtf v2.2d, v2.2d
-; CHECK-SD-NEXT:    ldp q21, q20, [sp]
-; CHECK-SD-NEXT:    scvtf v4.2d, v4.2d
-; CHECK-SD-NEXT:    ldp q23, q22, [sp, #96]
-; CHECK-SD-NEXT:    scvtf v6.2d, v6.2d
-; CHECK-SD-NEXT:    scvtf v19.2d, v19.2d
-; CHECK-SD-NEXT:    scvtf v17.2d, v17.2d
-; CHECK-SD-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    scvtf v21.2d, v21.2d
-; CHECK-SD-NEXT:    scvtf v24.2d, v1.2d
-; CHECK-SD-NEXT:    fcvtn v1.2s, v2.2d
-; CHECK-SD-NEXT:    scvtf v23.2d, v23.2d
-; CHECK-SD-NEXT:    scvtf v25.2d, v3.2d
-; CHECK-SD-NEXT:    fcvtn v2.2s, v4.2d
-; CHECK-SD-NEXT:    scvtf v26.2d, v5.2d
-; CHECK-SD-NEXT:    fcvtn v3.2s, v6.2d
-; CHECK-SD-NEXT:    scvtf v27.2d, v7.2d
-; CHECK-SD-NEXT:    scvtf v20.2d, v20.2d
-; CHECK-SD-NEXT:    fcvtn v5.2s, v19.2d
-; CHECK-SD-NEXT:    scvtf v18.2d, v18.2d
-; CHECK-SD-NEXT:    fcvtn v4.2s, v21.2d
-; CHECK-SD-NEXT:    fcvtn v6.2s, v17.2d
-; CHECK-SD-NEXT:    scvtf v16.2d, v16.2d
-; CHECK-SD-NEXT:    fcvtn v7.2s, v23.2d
-; CHECK-SD-NEXT:    scvtf v17.2d, v22.2d
-; CHECK-SD-NEXT:    fcvtn2 v0.4s, v24.2d
-; CHECK-SD-NEXT:    fcvtn2 v1.4s, v25.2d
-; CHECK-SD-NEXT:    fcvtn2 v2.4s, v26.2d
-; CHECK-SD-NEXT:    fcvtn2 v3.4s, v27.2d
-; CHECK-SD-NEXT:    fcvtn2 v5.4s, v18.2d
-; CHECK-SD-NEXT:    fcvtn2 v4.4s, v20.2d
-; CHECK-SD-NEXT:    fcvtn2 v6.4s, v16.2d
-; CHECK-SD-NEXT:    fcvtn2 v7.4s, v17.2d
+; CHECK-SD-NEXT:    mov x8, v0.d[1]
+; CHECK-SD-NEXT:    fmov x10, d0
+; CHECK-SD-NEXT:    mov v16.16b, v1.16b
+; CHECK-SD-NEXT:    fmov x11, d2
+; CHECK-SD-NEXT:    ldp q24, q20, [sp]
+; CHECK-SD-NEXT:    mov x9, v2.d[1]
+; CHECK-SD-NEXT:    fmov x12, d3
+; CHECK-SD-NEXT:    fmov x13, d4
+; CHECK-SD-NEXT:    scvtf s0, x10
+; CHECK-SD-NEXT:    ldp q21, q18, [sp, #32]
+; CHECK-SD-NEXT:    scvtf s2, x8
+; CHECK-SD-NEXT:    scvtf s1, x11
+; CHECK-SD-NEXT:    mov x10, v4.d[1]
+; CHECK-SD-NEXT:    fmov x11, d16
+; CHECK-SD-NEXT:    ldp q19, q17, [sp, #96]
+; CHECK-SD-NEXT:    scvtf s22, x9
+; CHECK-SD-NEXT:    mov x8, v3.d[1]
+; CHECK-SD-NEXT:    scvtf s4, x12
+; CHECK-SD-NEXT:    mov x12, v24.d[1]
+; CHECK-SD-NEXT:    mov x9, v16.d[1]
+; CHECK-SD-NEXT:    scvtf s3, x11
+; CHECK-SD-NEXT:    ldp q23, q16, [sp, #64]
+; CHECK-SD-NEXT:    mov v0.s[1], v2.s[0]
+; CHECK-SD-NEXT:    scvtf s25, x10
+; CHECK-SD-NEXT:    fmov x10, d6
+; CHECK-SD-NEXT:    mov v1.s[1], v22.s[0]
+; CHECK-SD-NEXT:    mov x11, v6.d[1]
+; CHECK-SD-NEXT:    scvtf s2, x13
+; CHECK-SD-NEXT:    mov x13, v21.d[1]
+; CHECK-SD-NEXT:    fmov x14, d19
+; CHECK-SD-NEXT:    scvtf s22, x9
+; CHECK-SD-NEXT:    mov x9, v5.d[1]
+; CHECK-SD-NEXT:    fmov x15, d17
+; CHECK-SD-NEXT:    mov v0.s[2], v3.s[0]
+; CHECK-SD-NEXT:    scvtf s3, x10
+; CHECK-SD-NEXT:    fmov x10, d24
+; CHECK-SD-NEXT:    mov v1.s[2], v4.s[0]
+; CHECK-SD-NEXT:    scvtf s24, x12
+; CHECK-SD-NEXT:    scvtf s6, x11
+; CHECK-SD-NEXT:    fmov x11, d5
+; CHECK-SD-NEXT:    fmov x12, d7
+; CHECK-SD-NEXT:    mov v2.s[1], v25.s[0]
+; CHECK-SD-NEXT:    scvtf s4, x10
+; CHECK-SD-NEXT:    fmov x10, d21
+; CHECK-SD-NEXT:    scvtf s21, x8
+; CHECK-SD-NEXT:    mov x8, v23.d[1]
+; CHECK-SD-NEXT:    scvtf s25, x13
+; CHECK-SD-NEXT:    mov x13, v19.d[1]
+; CHECK-SD-NEXT:    scvtf s26, x11
+; CHECK-SD-NEXT:    mov x11, v20.d[1]
+; CHECK-SD-NEXT:    mov v3.s[1], v6.s[0]
+; CHECK-SD-NEXT:    scvtf s5, x10
+; CHECK-SD-NEXT:    mov x10, v7.d[1]
+; CHECK-SD-NEXT:    scvtf s7, x14
+; CHECK-SD-NEXT:    mov v4.s[1], v24.s[0]
+; CHECK-SD-NEXT:    scvtf s24, x12
+; CHECK-SD-NEXT:    fmov x12, d20
+; CHECK-SD-NEXT:    scvtf s20, x8
+; CHECK-SD-NEXT:    fmov x8, d23
+; CHECK-SD-NEXT:    scvtf s19, x13
+; CHECK-SD-NEXT:    fmov x13, d18
+; CHECK-SD-NEXT:    fmov x14, d16
+; CHECK-SD-NEXT:    mov v2.s[2], v26.s[0]
+; CHECK-SD-NEXT:    mov v5.s[1], v25.s[0]
+; CHECK-SD-NEXT:    scvtf s23, x10
+; CHECK-SD-NEXT:    mov v0.s[3], v22.s[0]
+; CHECK-SD-NEXT:    scvtf s6, x8
+; CHECK-SD-NEXT:    mov x8, v18.d[1]
+; CHECK-SD-NEXT:    scvtf s18, x12
+; CHECK-SD-NEXT:    mov x12, v16.d[1]
+; CHECK-SD-NEXT:    scvtf s16, x13
+; CHECK-SD-NEXT:    mov x13, v17.d[1]
+; CHECK-SD-NEXT:    scvtf s17, x14
+; CHECK-SD-NEXT:    mov v7.s[1], v19.s[0]
+; CHECK-SD-NEXT:    scvtf s19, x9
+; CHECK-SD-NEXT:    mov v3.s[2], v24.s[0]
+; CHECK-SD-NEXT:    scvtf s24, x11
+; CHECK-SD-NEXT:    mov v1.s[3], v21.s[0]
+; CHECK-SD-NEXT:    mov v6.s[1], v20.s[0]
+; CHECK-SD-NEXT:    scvtf s20, x15
+; CHECK-SD-NEXT:    mov v4.s[2], v18.s[0]
+; CHECK-SD-NEXT:    scvtf s18, x8
+; CHECK-SD-NEXT:    mov v5.s[2], v16.s[0]
+; CHECK-SD-NEXT:    scvtf s16, x12
+; CHECK-SD-NEXT:    mov v2.s[3], v19.s[0]
+; CHECK-SD-NEXT:    mov v3.s[3], v23.s[0]
+; CHECK-SD-NEXT:    mov v6.s[2], v17.s[0]
+; CHECK-SD-NEXT:    mov v7.s[2], v20.s[0]
+; CHECK-SD-NEXT:    scvtf s17, x13
+; CHECK-SD-NEXT:    mov v4.s[3], v24.s[0]
+; CHECK-SD-NEXT:    mov v5.s[3], v18.s[0]
+; CHECK-SD-NEXT:    mov v6.s[3], v16.s[0]
+; CHECK-SD-NEXT:    mov v7.s[3], v17.s[0]
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: stofp_v32i64_v32f32:
@@ -4728,42 +4973,99 @@ entry:
 define <32 x float> @utofp_v32i64_v32f32(<32 x i64> %a) {
 ; CHECK-SD-LABEL: utofp_v32i64_v32f32:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ldp q17, q16, [sp, #64]
-; CHECK-SD-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-SD-NEXT:    ldp q19, q18, [sp, #32]
-; CHECK-SD-NEXT:    ucvtf v2.2d, v2.2d
-; CHECK-SD-NEXT:    ldp q21, q20, [sp]
-; CHECK-SD-NEXT:    ucvtf v4.2d, v4.2d
-; CHECK-SD-NEXT:    ldp q23, q22, [sp, #96]
-; CHECK-SD-NEXT:    ucvtf v6.2d, v6.2d
-; CHECK-SD-NEXT:    ucvtf v19.2d, v19.2d
-; CHECK-SD-NEXT:    ucvtf v17.2d, v17.2d
-; CHECK-SD-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    ucvtf v21.2d, v21.2d
-; CHECK-SD-NEXT:    ucvtf v24.2d, v1.2d
-; CHECK-SD-NEXT:    fcvtn v1.2s, v2.2d
-; CHECK-SD-NEXT:    ucvtf v23.2d, v23.2d
-; CHECK-SD-NEXT:    ucvtf v25.2d, v3.2d
-; CHECK-SD-NEXT:    fcvtn v2.2s, v4.2d
-; CHECK-SD-NEXT:    ucvtf v26.2d, v5.2d
-; CHECK-SD-NEXT:    fcvtn v3.2s, v6.2d
-; CHECK-SD-NEXT:    ucvtf v27.2d, v7.2d
-; CHECK-SD-NEXT:    ucvtf v20.2d, v20.2d
-; CHECK-SD-NEXT:    fcvtn v5.2s, v19.2d
-; CHECK-SD-NEXT:    ucvtf v18.2d, v18.2d
-; CHECK-SD-NEXT:    fcvtn v4.2s, v21.2d
-; CHECK-SD-NEXT:    fcvtn v6.2s, v17.2d
-; CHECK-SD-NEXT:    ucvtf v16.2d, v16.2d
-; CHECK-SD-NEXT:    fcvtn v7.2s, v23.2d
-; CHECK-SD-NEXT:    ucvtf v17.2d, v22.2d
-; CHECK-SD-NEXT:    fcvtn2 v0.4s, v24.2d
-; CHECK-SD-NEXT:    fcvtn2 v1.4s, v25.2d
-; CHECK-SD-NEXT:    fcvtn2 v2.4s, v26.2d
-; CHECK-SD-NEXT:    fcvtn2 v3.4s, v27.2d
-; CHECK-SD-NEXT:    fcvtn2 v5.4s, v18.2d
-; CHECK-SD-NEXT:    fcvtn2 v4.4s, v20.2d
-; CHECK-SD-NEXT:    fcvtn2 v6.4s, v16.2d
-; CHECK-SD-NEXT:    fcvtn2 v7.4s, v17.2d
+; CHECK-SD-NEXT:    mov x8, v0.d[1]
+; CHECK-SD-NEXT:    fmov x10, d0
+; CHECK-SD-NEXT:    mov v16.16b, v1.16b
+; CHECK-SD-NEXT:    fmov x11, d2
+; CHECK-SD-NEXT:    ldp q24, q20, [sp]
+; CHECK-SD-NEXT:    mov x9, v2.d[1]
+; CHECK-SD-NEXT:    fmov x12, d3
+; CHECK-SD-NEXT:    fmov x13, d4
+; CHECK-SD-NEXT:    ucvtf s0, x10
+; CHECK-SD-NEXT:    ldp q21, q18, [sp, #32]
+; CHECK-SD-NEXT:    ucvtf s2, x8
+; CHECK-SD-NEXT:    ucvtf s1, x11
+; CHECK-SD-NEXT:    mov x10, v4.d[1]
+; CHECK-SD-NEXT:    fmov x11, d16
+; CHECK-SD-NEXT:    ldp q19, q17, [sp, #96]
+; CHECK-SD-NEXT:    ucvtf s22, x9
+; CHECK-SD-NEXT:    mov x8, v3.d[1]
+; CHECK-SD-NEXT:    ucvtf s4, x12
+; CHECK-SD-NEXT:    mov x12, v24.d[1]
+; CHECK-SD-NEXT:    mov x9, v16.d[1]
+; CHECK-SD-NEXT:    ucvtf s3, x11
+; CHECK-SD-NEXT:    ldp q23, q16, [sp, #64]
+; CHECK-SD-NEXT:    mov v0.s[1], v2.s[0]
+; CHECK-SD-NEXT:    ucvtf s25, x10
+; CHECK-SD-NEXT:    fmov x10, d6
+; CHECK-SD-NEXT:    mov v1.s[1], v22.s[0]
+; CHECK-SD-NEXT:    mov x11, v6.d[1]
+; CHECK-SD-NEXT:    ucvtf s2, x13
+; CHECK-SD-NEXT:    mov x13, v21.d[1]
+; CHECK-SD-NEXT:    fmov x14, d19
+; CHECK-SD-NEXT:    ucvtf s22, x9
+; CHECK-SD-NEXT:    mov x9, v5.d[1]
+; CHECK-SD-NEXT:    fmov x15, d17
+; CHECK-SD-NEXT:    mov v0.s[2], v3.s[0]
+; CHECK-SD-NEXT:    ucvtf s3, x10
+; CHECK-SD-NEXT:    fmov x10, d24
+; CHECK-SD-NEXT:    mov v1.s[2], v4.s[0]
+; CHECK-SD-NEXT:    ucvtf s24, x12
+; CHECK-SD-NEXT:    ucvtf s6, x11
+; CHECK-SD-NEXT:    fmov x11, d5
+; CHECK-SD-NEXT:    fmov x12, d7
+; CHECK-SD-NEXT:    mov v2.s[1], v25.s[0]
+; CHECK-SD-NEXT:    ucvtf s4, x10
+; CHECK-SD-NEXT:    fmov x10, d21
+; CHECK-SD-NEXT:    ucvtf s21, x8
+; CHECK-SD-NEXT:    mov x8, v23.d[1]
+; CHECK-SD-NEXT:    ucvtf s25, x13
+; CHECK-SD-NEXT:    mov x13, v19.d[1]
+; CHECK-SD-NEXT:    ucvtf s26, x11
+; CHECK-SD-NEXT:    mov x11, v20.d[1]
+; CHECK-SD-NEXT:    mov v3.s[1], v6.s[0]
+; CHECK-SD-NEXT:    ucvtf s5, x10
+; CHECK-SD-NEXT:    mov x10, v7.d[1]
+; CHECK-SD-NEXT:    ucvtf s7, x14
+; CHECK-SD-NEXT:    mov v4.s[1], v24.s[0]
+; CHECK-SD-NEXT:    ucvtf s24, x12
+; CHECK-SD-NEXT:    fmov x12, d20
+; CHECK-SD-NEXT:    ucvtf s20, x8
+; CHECK-SD-NEXT:    fmov x8, d23
+; CHECK-SD-NEXT:    ucvtf s19, x13
+; CHECK-SD-NEXT:    fmov x13, d18
+; CHECK-SD-NEXT:    fmov x14, d16
+; CHECK-SD-NEXT:    mov v2.s[2], v26.s[0]
+; CHECK-SD-NEXT:    mov v5.s[1], v25.s[0]
+; CHECK-SD-NEXT:    ucvtf s23, x10
+; CHECK-SD-NEXT:    mov v0.s[3], v22.s[0]
+; CHECK-SD-NEXT:    ucvtf s6, x8
+; CHECK-SD-NEXT:    mov x8, v18.d[1]
+; CHECK-SD-NEXT:    ucvtf s18, x12
+; CHECK-SD-NEXT:    mov x12, v16.d[1]
+; CHECK-SD-NEXT:    ucvtf s16, x13
+; CHECK-SD-NEXT:    mov x13, v17.d[1]
+; CHECK-SD-NEXT:    ucvtf s17, x14
+; CHECK-SD-NEXT:    mov v7.s[1], v19.s[0]
+; CHECK-SD-NEXT:    ucvtf s19, x9
+; CHECK-SD-NEXT:    mov v3.s[2], v24.s[0]
+; CHECK-SD-NEXT:    ucvtf s24, x11
+; CHECK-SD-NEXT:    mov v1.s[3], v21.s[0]
+; CHECK-SD-NEXT:    mov v6.s[1], v20.s[0]
+; CHECK-SD-NEXT:    ucvtf s20, x15
+; CHECK-SD-NEXT:    mov v4.s[2], v18.s[0]
+; CHECK-SD-NEXT:    ucvtf s18, x8
+; CHECK-SD-NEXT:    mov v5.s[2], v16.s[0]
+; CHECK-SD-NEXT:    ucvtf s16, x12
+; CHECK-SD-NEXT:    mov v2.s[3], v19.s[0]
+; CHECK-SD-NEXT:    mov v3.s[3], v23.s[0]
+; CHECK-SD-NEXT:    mov v6.s[2], v17.s[0]
+; CHECK-SD-NEXT:    mov v7.s[2], v20.s[0]
+; CHECK-SD-NEXT:    ucvtf s17, x13
+; CHECK-SD-NEXT:    mov v4.s[3], v24.s[0]
+; CHECK-SD-NEXT:    mov v5.s[3], v18.s[0]
+; CHECK-SD-NEXT:    mov v6.s[3], v16.s[0]
+; CHECK-SD-NEXT:    mov v7.s[3], v17.s[0]
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: utofp_v32i64_v32f32:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
index 573fe3d8b8a77..1d9e01f4ecfdf 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
@@ -722,8 +722,11 @@ define <1 x float> @ucvtf_v1i64_v1f32(<1 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v1i64_v1f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    movi d1, #0000000000000000
+; CHECK-NEXT:    ucvtf s0, x8
+; CHECK-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
   %res = uitofp <1 x i64> %op1 to <1 x float>
   ret <1 x float> %res
@@ -733,8 +736,12 @@ define <1 x float> @ucvtf_v1i64_v1f32(<1 x i64> %op1) vscale_range(2,0) #0 {
 define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v2i64_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    ucvtf s0, x9
+; CHECK-NEXT:    ucvtf s1, x8
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
   %res = uitofp <2 x i64> %op1 to <2 x float>
   ret <2 x float> %res
@@ -1646,8 +1653,11 @@ define <1 x float> @scvtf_v1i64_v1f32(<1 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v1i64_v1f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    movi d1, #0000000000000000
+; CHECK-NEXT:    scvtf s0, x8
+; CHECK-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
   %res = sitofp <1 x i64> %op1 to <1 x float>
   ret <1 x float> %res
@@ -1657,8 +1667,12 @@ define <1 x float> @scvtf_v1i64_v1f32(<1 x i64> %op1) vscale_range(2,0) #0 {
 define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v2i64_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    scvtf s0, x9
+; CHECK-NEXT:    scvtf s1, x8
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
   %res = sitofp <2 x i64> %op1 to <2 x float>
   ret <2 x float> %res
diff --git a/llvm/test/CodeGen/AArch64/vector-fcvt.ll b/llvm/test/CodeGen/AArch64/vector-fcvt.ll
index 8f38bdbedc629..a6b43d514594e 100644
--- a/llvm/test/CodeGen/AArch64/vector-fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/vector-fcvt.ll
@@ -87,14 +87,29 @@ define <8 x float> @sitofp_i32_float(<8 x i32> %a) {
 define <8 x float> @sitofp_i64_float(<8 x i64> %a) {
 ; CHECK-LABEL: sitofp_i64_float:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    scvtf v2.2d, v2.2d
-; CHECK-NEXT:    scvtf v4.2d, v1.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn v1.2s, v2.2d
-; CHECK-NEXT:    scvtf v2.2d, v3.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v4.2d
-; CHECK-NEXT:    fcvtn2 v1.4s, v2.2d
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    mov x9, v2.d[1]
+; CHECK-NEXT:    fmov x10, d0
+; CHECK-NEXT:    fmov x11, d2
+; CHECK-NEXT:    scvtf s0, x10
+; CHECK-NEXT:    mov x10, v3.d[1]
+; CHECK-NEXT:    scvtf s4, x8
+; CHECK-NEXT:    scvtf s5, x9
+; CHECK-NEXT:    scvtf s2, x11
+; CHECK-NEXT:    fmov x9, d1
+; CHECK-NEXT:    fmov x11, d3
+; CHECK-NEXT:    mov x8, v1.d[1]
+; CHECK-NEXT:    scvtf s1, x9
+; CHECK-NEXT:    mov v0.s[1], v4.s[0]
+; CHECK-NEXT:    scvtf s3, x11
+; CHECK-NEXT:    mov v2.s[1], v5.s[0]
+; CHECK-NEXT:    scvtf s4, x8
+; CHECK-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-NEXT:    scvtf s1, x10
+; CHECK-NEXT:    mov v2.s[2], v3.s[0]
+; CHECK-NEXT:    mov v0.s[3], v4.s[0]
+; CHECK-NEXT:    mov v2.s[3], v1.s[0]
+; CHECK-NEXT:    mov v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %1 = sitofp <8 x i64> %a to <8 x float>
   ret <8 x float> %1
@@ -177,14 +192,29 @@ define <8 x float> @uitofp_i32_float(<8 x i32> %a) {
 define <8 x float> @uitofp_i64_float(<8 x i64> %a) {
 ; CHECK-LABEL: uitofp_i64_float:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    ucvtf v2.2d, v2.2d
-; CHECK-NEXT:    ucvtf v4.2d, v1.2d
-; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    fcvtn v1.2s, v2.2d
-; CHECK-NEXT:    ucvtf v2.2d, v3.2d
-; CHECK-NEXT:    fcvtn2 v0.4s, v4.2d
-; CHECK-NEXT:    fcvtn2 v1.4s, v2.2d
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    mov x9, v2.d[1]
+; CHECK-NEXT:    fmov x10, d0
+; CHECK-NEXT:    fmov x11, d2
+; CHECK-NEXT:    ucvtf s0, x10
+; CHECK-NEXT:    mov x10, v3.d[1]
+; CHECK-NEXT:    ucvtf s4, x8
+; CHECK-NEXT:    ucvtf s5, x9
+; CHECK-NEXT:    ucvtf s2, x11
+; CHECK-NEXT:    fmov x9, d1
+; CHECK-NEXT:    fmov x11, d3
+; CHECK-NEXT:    mov x8, v1.d[1]
+; CHECK-NEXT:    ucvtf s1, x9
+; CHECK-NEXT:    mov v0.s[1], v4.s[0]
+; CHECK-NEXT:    ucvtf s3, x11
+; CHECK-NEXT:    mov v2.s[1], v5.s[0]
+; CHECK-NEXT:    ucvtf s4, x8
+; CHECK-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-NEXT:    ucvtf s1, x10
+; CHECK-NEXT:    mov v2.s[2], v3.s[0]
+; CHECK-NEXT:    mov v0.s[3], v4.s[0]
+; CHECK-NEXT:    mov v2.s[3], v1.s[0]
+; CHECK-NEXT:    mov v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %1 = uitofp <8 x i64> %a to <8 x float>
   ret <8 x float> %1



More information about the llvm-commits mailing list