[llvm] [AArch64] Don't try to vectorize fixed point to fp narrowing conversion (PR #130665)
Pranav Kant via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 17 22:12:53 PDT 2025
https://github.com/pranavk updated https://github.com/llvm/llvm-project/pull/130665
>From a25d549726b5dcb77a5964e6f48feefd704b4b6a Mon Sep 17 00:00:00 2001
From: Pranav Kant <prka at google.com>
Date: Mon, 10 Mar 2025 19:52:00 +0000
Subject: [PATCH 1/5] [AArch64] Don't try to vectorize fixed point to fp
narrowing conversion
---
.../lib/Target/AArch64/AArch64ISelLowering.cpp | 18 +++++-------------
1 file changed, 5 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9511206c0660a..8ba763df1360b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5095,19 +5095,11 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
uint64_t VTSize = VT.getFixedSizeInBits();
uint64_t InVTSize = InVT.getFixedSizeInBits();
if (VTSize < InVTSize) {
- MVT CastVT =
- MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
- InVT.getVectorNumElements());
- if (IsStrict) {
- In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
- {Op.getOperand(0), In});
- return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
- {In.getValue(1), In.getValue(0),
- DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
- }
- In = DAG.getNode(Opc, dl, CastVT, In);
- return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
- DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
+ // Due to the absence of any vector instructions to directly convert
+ // larger fixed point to lower floating point, we end up using intermediate
+ // representation before finally getting VTSize-d floating point. This extra
+ // rounding can lead to subtly incorrect results.
+ return SDValue();
}
if (VTSize > InVTSize) {
>From 2f5f5569a04602fda37c45db36acc298f22b600a Mon Sep 17 00:00:00 2001
From: Pranav Kant <prka at google.com>
Date: Mon, 10 Mar 2025 21:00:03 +0000
Subject: [PATCH 2/5] test case
---
.../AArch64/aarch64-int-to-fp-vectorize.ll | 89 +++++++++++++++++++
1 file changed, 89 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/aarch64-int-to-fp-vectorize.ll
diff --git a/llvm/test/CodeGen/AArch64/aarch64-int-to-fp-vectorize.ll b/llvm/test/CodeGen/AArch64/aarch64-int-to-fp-vectorize.ll
new file mode 100644
index 0000000000000..c5c7c1d4658f4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-int-to-fp-vectorize.ll
@@ -0,0 +1,89 @@
+; RUN: llc -o - %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64-none-linux-gnu"
+
+module asm ".globl _ZSt21ios_base_library_initv"
+
+ at llvm.compiler.used = appending global [1 x ptr] [ptr @_Z9BatchCastPKlPfi], section "llvm.metadata"
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable
+define dso_local void @_Z9BatchCastPKlPfi(ptr noalias noundef readonly captures(none) %input, ptr noalias noundef writeonly captures(none) %output, i32 noundef %n) #0 {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext nneg i32 %n to i64
+ %min.iters.check = icmp ult i64 %wide.trip.count, 4
+ br i1 %min.iters.check, label %scalar.ph, label %vector.ph
+
+vector.ph: ; preds = %for.body.preheader
+ %n.mod.vf = urem i64 %wide.trip.count, 4
+ %n.vec = sub i64 %wide.trip.count, %n.mod.vf
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %0 = add i64 %index, 0
+ %1 = getelementptr inbounds nuw i64, ptr %input, i64 %0
+ %2 = getelementptr inbounds nuw i64, ptr %1, i32 0
+ %wide.load = load <4 x i64>, ptr %2, align 8, !tbaa !6
+ ; CHECK-NOT: scvtf {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %3 = sitofp <4 x i64> %wide.load to <4 x float>
+ %4 = getelementptr inbounds nuw float, ptr %output, i64 %0
+ %5 = getelementptr inbounds nuw float, ptr %4, i32 0
+ store <4 x float> %3, ptr %5, align 4, !tbaa !10
+ %index.next = add nuw i64 %index, 4
+ %6 = icmp eq i64 %index.next, %n.vec
+ br i1 %6, label %middle.block, label %vector.body, !llvm.loop !12
+
+middle.block: ; preds = %vector.body
+ %cmp.n = icmp eq i64 %wide.trip.count, %n.vec
+ br i1 %cmp.n, label %for.cond.cleanup.loopexit, label %scalar.ph
+
+scalar.ph: ; preds = %for.body.preheader, %middle.block
+ %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %for.body.preheader ]
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %middle.block, %for.body
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ ret void
+
+for.body: ; preds = %scalar.ph, %for.body
+ %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds nuw i64, ptr %input, i64 %indvars.iv
+ %7 = load i64, ptr %arrayidx, align 8, !tbaa !6
+ %conv = sitofp i64 %7 to float
+ %arrayidx2 = getelementptr inbounds nuw float, ptr %output, i64 %indvars.iv
+ store float %conv, ptr %arrayidx2, align 4, !tbaa !10
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !17
+}
+
+attributes #0 = { mustprogress nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,-fmv" }
+
+!llvm.module.flags = !{!0, !1, !2, !3, !4}
+!llvm.ident = !{!5}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 8, !"PIC Level", i32 2}
+!2 = !{i32 7, !"PIE Level", i32 2}
+!3 = !{i32 7, !"uwtable", i32 2}
+!4 = !{i32 7, !"frame-pointer", i32 1}
+!5 = !{!"clang version 21.0.0git (git at github.com:llvm/llvm-project.git 46236f4c3dbe11e14fe7ac1f4b903637efedfecf)"}
+!6 = !{!7, !7, i64 0}
+!7 = !{!"long", !8, i64 0}
+!8 = !{!"omnipotent char", !9, i64 0}
+!9 = !{!"Simple C++ TBAA"}
+!10 = !{!11, !11, i64 0}
+!11 = !{!"float", !8, i64 0}
+!12 = distinct !{!12, !13, !14, !15, !16}
+!13 = !{!"llvm.loop.mustprogress"}
+!14 = !{!"llvm.loop.unroll.disable"}
+!15 = !{!"llvm.loop.isvectorized", i32 1}
+!16 = !{!"llvm.loop.unroll.runtime.disable"}
+!17 = distinct !{!17, !13, !14, !15}
>From 61dc589090608954a34471cae4c092082a67c2e0 Mon Sep 17 00:00:00 2001
From: Pranav Kant <prka at google.com>
Date: Mon, 17 Mar 2025 23:52:44 +0000
Subject: [PATCH 3/5] only for f32
---
.../Target/AArch64/AArch64ISelLowering.cpp | 27 +++++++++++++++----
1 file changed, 22 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8ba763df1360b..b9a97d381f1b8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5095,11 +5095,28 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
uint64_t VTSize = VT.getFixedSizeInBits();
uint64_t InVTSize = InVT.getFixedSizeInBits();
if (VTSize < InVTSize) {
- // Due to the absence of any vector instructions to directly convert
- // larger fixed point to lower floating point, we end up using intermediate
- // representation before finally getting VTSize-d floating point. This extra
- // rounding can lead to subtly incorrect results.
- return SDValue();
+ if (VT.isVector() && VT.getVectorElementType().getFixedSizeInBits() == 32) {
+ // There is no vector instruction that converts a wider integer directly
+ // to a narrower floating-point type, so the lowering below first converts
+ // to a floating-point type of the input width and then rounds to VT. This
+ // double rounding can give subtly incorrect results. For now, we reject
+ // all such conversions to f32 here.
+ return SDValue();
+ }
+
+ MVT CastVT =
+ MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
+ InVT.getVectorNumElements());
+ if (IsStrict) {
+ In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
+ {Op.getOperand(0), In});
+ return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
+ {In.getValue(1), In.getValue(0),
+ DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
+ }
+ In = DAG.getNode(Opc, dl, CastVT, In);
+ return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
+ DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
}
if (VTSize > InVTSize) {
>From f3165805e0887424a68b4cea733db4f29c1d5d1d Mon Sep 17 00:00:00 2001
From: Pranav Kant <prka at google.com>
Date: Tue, 18 Mar 2025 00:06:29 +0000
Subject: [PATCH 4/5] modify all tests cases
---
.../aarch64-neon-vector-insert-uaddlv.ll | 81 +-
.../CodeGen/AArch64/arm64-convert-v4f64.ll | 74 +-
.../CodeGen/AArch64/bf16-v4-instructions.ll | 122 +-
.../CodeGen/AArch64/bf16-v8-instructions.ll | 261 ++-
.../test/CodeGen/AArch64/complex-int-to-fp.ll | 174 +-
.../fold-int-pow2-with-fmul-or-fdiv.ll | 19 +-
.../CodeGen/AArch64/fp-intrinsics-vector.ll | 51 +-
.../CodeGen/AArch64/fp16-v16-instructions.ll | 201 +-
.../CodeGen/AArch64/fp16-v8-instructions.ll | 103 +-
llvm/test/CodeGen/AArch64/fprcvt-cvtf.ll | 49 +-
llvm/test/CodeGen/AArch64/itofp-bf16.ll | 1239 ++++++++----
llvm/test/CodeGen/AArch64/itofp.ll | 1759 ++++++++++++++---
.../AArch64/sve-fixed-length-int-to-fp.ll | 53 +-
llvm/test/CodeGen/AArch64/vector-fcvt.ll | 98 +-
14 files changed, 3328 insertions(+), 956 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
index 1b7bc128d6332..f9a0ffce75212 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
@@ -146,13 +146,28 @@ define void @insert_vec_v6i64_uaddlv_from_v4i32(ptr %0) {
; CHECK-LABEL: insert_vec_v6i64_uaddlv_from_v4i32:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
+; CHECK-NEXT: movi.2d v2, #0x000000ffffffff
; CHECK-NEXT: uaddlv.4s d1, v0
; CHECK-NEXT: mov.d v0[0], v1[0]
-; CHECK-NEXT: movi.2d v1, #0000000000000000
-; CHECK-NEXT: ucvtf.2d v0, v0
-; CHECK-NEXT: str d1, [x0, #16]
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ushr.2d v1, v0, #32
+; CHECK-NEXT: and.16b v0, v0, v2
+; CHECK-NEXT: mov.d x8, v1[1]
+; CHECK-NEXT: fmov x9, d1
+; CHECK-NEXT: scvtf s2, x9
+; CHECK-NEXT: mov w9, #1333788672 ; =0x4f800000
+; CHECK-NEXT: scvtf s1, x8
+; CHECK-NEXT: mov.d x8, v0[1]
+; CHECK-NEXT: dup.2s v3, w9
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: scvtf s0, x8
+; CHECK-NEXT: mov.s v2[1], v1[0]
+; CHECK-NEXT: scvtf s1, x9
+; CHECK-NEXT: fmul.2s v2, v2, v3
+; CHECK-NEXT: mov.s v1[1], v0[0]
+; CHECK-NEXT: movi.2d v0, #0000000000000000
+; CHECK-NEXT: fadd.2s v1, v2, v1
+; CHECK-NEXT: str d0, [x0, #16]
+; CHECK-NEXT: str q1, [x0]
; CHECK-NEXT: ret
entry:
@@ -167,10 +182,25 @@ define void @insert_vec_v2i64_uaddlv_from_v4i32(ptr %0) {
; CHECK-LABEL: insert_vec_v2i64_uaddlv_from_v4i32:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
+; CHECK-NEXT: movi.2d v2, #0x000000ffffffff
; CHECK-NEXT: uaddlv.4s d1, v0
; CHECK-NEXT: mov.d v0[0], v1[0]
-; CHECK-NEXT: ucvtf.2d v0, v0
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-NEXT: ushr.2d v1, v0, #32
+; CHECK-NEXT: and.16b v0, v0, v2
+; CHECK-NEXT: mov.d x8, v1[1]
+; CHECK-NEXT: fmov x9, d1
+; CHECK-NEXT: scvtf s2, x9
+; CHECK-NEXT: mov w9, #1333788672 ; =0x4f800000
+; CHECK-NEXT: scvtf s1, x8
+; CHECK-NEXT: mov.d x8, v0[1]
+; CHECK-NEXT: dup.2s v3, w9
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: scvtf s0, x8
+; CHECK-NEXT: mov.s v2[1], v1[0]
+; CHECK-NEXT: scvtf s1, x9
+; CHECK-NEXT: fmul.2s v2, v2, v3
+; CHECK-NEXT: mov.s v1[1], v0[0]
+; CHECK-NEXT: fadd.2s v0, v2, v1
; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: ret
@@ -186,11 +216,26 @@ define void @insert_vec_v5i64_uaddlv_from_v4i32(ptr %0) {
; CHECK-LABEL: insert_vec_v5i64_uaddlv_from_v4i32:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
+; CHECK-NEXT: movi.2d v2, #0x000000ffffffff
; CHECK-NEXT: str wzr, [x0, #16]
; CHECK-NEXT: uaddlv.4s d1, v0
; CHECK-NEXT: mov.d v0[0], v1[0]
-; CHECK-NEXT: ucvtf.2d v0, v0
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-NEXT: ushr.2d v1, v0, #32
+; CHECK-NEXT: and.16b v0, v0, v2
+; CHECK-NEXT: mov.d x8, v1[1]
+; CHECK-NEXT: fmov x9, d1
+; CHECK-NEXT: scvtf s2, x9
+; CHECK-NEXT: mov w9, #1333788672 ; =0x4f800000
+; CHECK-NEXT: scvtf s1, x8
+; CHECK-NEXT: mov.d x8, v0[1]
+; CHECK-NEXT: dup.2s v3, w9
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: scvtf s0, x8
+; CHECK-NEXT: mov.s v2[1], v1[0]
+; CHECK-NEXT: scvtf s1, x9
+; CHECK-NEXT: fmul.2s v2, v2, v3
+; CHECK-NEXT: mov.s v1[1], v0[0]
+; CHECK-NEXT: fadd.2s v0, v2, v1
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret
@@ -251,12 +296,20 @@ define void @insert_vec_v16i64_uaddlv_from_v4i16(ptr %0) {
; CHECK-LABEL: insert_vec_v16i64_uaddlv_from_v4i16:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
-; CHECK-NEXT: movi.2d v2, #0000000000000000
-; CHECK-NEXT: uaddlv.4h s1, v0
+; CHECK-NEXT: movi.2d v1, #0000000000000000
+; CHECK-NEXT: mov w9, #1333788672 ; =0x4f800000
+; CHECK-NEXT: scvtf s3, xzr
+; CHECK-NEXT: dup.2s v4, w9
+; CHECK-NEXT: uaddlv.4h s2, v0
; CHECK-NEXT: stp q0, q0, [x0, #32]
-; CHECK-NEXT: mov.s v2[0], v1[0]
-; CHECK-NEXT: ucvtf.2d v1, v2
-; CHECK-NEXT: fcvtn v1.2s, v1.2d
+; CHECK-NEXT: mov.s v1[0], v2[0]
+; CHECK-NEXT: mov.d x8, v1[1]
+; CHECK-NEXT: scvtf s2, x8
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: scvtf s1, x8
+; CHECK-NEXT: mov.s v1[1], v2[0]
+; CHECK-NEXT: fmul.2s v2, v4, v3[0]
+; CHECK-NEXT: fadd.2s v1, v2, v1
; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
index 508f68d6f14d4..01e6cb8f6439d 100644
--- a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
@@ -38,11 +38,38 @@ define <8 x i8> @fptosi_v4f64_to_v4i8(ptr %ptr) {
define <4 x half> @uitofp_v4i64_to_v4f16(ptr %ptr) {
; CHECK-LABEL: uitofp_v4i64_to_v4f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0]
-; CHECK-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-NEXT: ldp q1, q3, [x0]
+; CHECK-NEXT: movi v0.2d, #0x000000ffffffff
+; CHECK-NEXT: ushr v2.2d, v1.2d, #32
+; CHECK-NEXT: ushr v5.2d, v3.2d, #32
+; CHECK-NEXT: and v1.16b, v1.16b, v0.16b
+; CHECK-NEXT: and v0.16b, v3.16b, v0.16b
+; CHECK-NEXT: mov x8, v2.d[1]
+; CHECK-NEXT: fmov x10, d2
+; CHECK-NEXT: mov x9, v1.d[1]
+; CHECK-NEXT: scvtf s4, x10
+; CHECK-NEXT: scvtf s2, x8
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: scvtf s1, x9
+; CHECK-NEXT: mov x9, v5.d[1]
+; CHECK-NEXT: scvtf s3, x8
+; CHECK-NEXT: fmov x8, d5
+; CHECK-NEXT: mov v4.s[1], v2.s[0]
+; CHECK-NEXT: scvtf s2, x8
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: mov v3.s[1], v1.s[0]
+; CHECK-NEXT: scvtf s1, x8
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: scvtf s0, x9
+; CHECK-NEXT: mov v4.s[2], v2.s[0]
+; CHECK-NEXT: mov v3.s[2], v1.s[0]
+; CHECK-NEXT: scvtf s1, x8
+; CHECK-NEXT: mov w8, #1333788672 // =0x4f800000
+; CHECK-NEXT: mov v4.s[3], v0.s[0]
+; CHECK-NEXT: dup v0.4s, w8
+; CHECK-NEXT: mov v3.s[3], v1.s[0]
+; CHECK-NEXT: fmul v0.4s, v4.4s, v0.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v3.4s
; CHECK-NEXT: fcvtn v0.4h, v0.4s
; CHECK-NEXT: ret
%tmp1 = load <4 x i64>, ptr %ptr
@@ -53,13 +80,40 @@ define <4 x half> @uitofp_v4i64_to_v4f16(ptr %ptr) {
define <4 x bfloat> @uitofp_v4i64_to_v4bf16(ptr %ptr) {
; CHECK-LABEL: uitofp_v4i64_to_v4bf16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ldp q1, q3, [x0]
+; CHECK-NEXT: movi v0.2d, #0x000000ffffffff
+; CHECK-NEXT: ushr v2.2d, v1.2d, #32
+; CHECK-NEXT: ushr v5.2d, v3.2d, #32
+; CHECK-NEXT: and v1.16b, v1.16b, v0.16b
+; CHECK-NEXT: and v0.16b, v3.16b, v0.16b
+; CHECK-NEXT: mov x8, v2.d[1]
+; CHECK-NEXT: fmov x10, d2
+; CHECK-NEXT: mov x9, v1.d[1]
+; CHECK-NEXT: scvtf s4, x10
+; CHECK-NEXT: scvtf s2, x8
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: scvtf s1, x9
+; CHECK-NEXT: mov x9, v5.d[1]
+; CHECK-NEXT: scvtf s3, x8
+; CHECK-NEXT: fmov x8, d5
+; CHECK-NEXT: mov v4.s[1], v2.s[0]
+; CHECK-NEXT: scvtf s2, x8
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: mov v3.s[1], v1.s[0]
+; CHECK-NEXT: scvtf s1, x8
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: scvtf s0, x9
+; CHECK-NEXT: mov v4.s[2], v2.s[0]
; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-NEXT: mov v3.s[2], v1.s[0]
+; CHECK-NEXT: scvtf s1, x8
+; CHECK-NEXT: mov w8, #1333788672 // =0x4f800000
+; CHECK-NEXT: mov v4.s[3], v0.s[0]
+; CHECK-NEXT: dup v0.4s, w8
+; CHECK-NEXT: mov v3.s[3], v1.s[0]
; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: fmul v0.4s, v4.4s, v0.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v3.4s
; CHECK-NEXT: ushr v3.4s, v0.4s, #16
; CHECK-NEXT: add v2.4s, v0.4s, v2.4s
; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
diff --git a/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll
index 1cd0294b0083e..42da624a5f068 100644
--- a/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll
@@ -310,29 +310,43 @@ define <4 x bfloat> @sitofp_i32(<4 x i32> %a) #0 {
define <4 x bfloat> @sitofp_i64(<4 x i64> %a) #0 {
; CHECK-CVT-LABEL: sitofp_i64:
; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: scvtf v0.2d, v0.2d
-; CHECK-CVT-NEXT: scvtf v1.2d, v1.2d
-; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
-; CHECK-CVT-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-CVT-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-CVT-NEXT: movi v1.4s, #1
-; CHECK-CVT-NEXT: ushr v3.4s, v0.4s, #16
-; CHECK-CVT-NEXT: add v2.4s, v0.4s, v2.4s
-; CHECK-CVT-NEXT: and v1.16b, v3.16b, v1.16b
-; CHECK-CVT-NEXT: fcmeq v3.4s, v0.4s, v0.4s
-; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
-; CHECK-CVT-NEXT: add v1.4s, v1.4s, v2.4s
-; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v3.16b
+; CHECK-CVT-NEXT: mov x8, v0.d[1]
+; CHECK-CVT-NEXT: fmov x9, d0
+; CHECK-CVT-NEXT: scvtf s2, x9
+; CHECK-CVT-NEXT: mov x9, v1.d[1]
+; CHECK-CVT-NEXT: scvtf s0, x8
+; CHECK-CVT-NEXT: fmov x8, d1
+; CHECK-CVT-NEXT: scvtf s1, x8
+; CHECK-CVT-NEXT: mov v2.s[1], v0.s[0]
+; CHECK-CVT-NEXT: scvtf s0, x9
+; CHECK-CVT-NEXT: mov v2.s[2], v1.s[0]
+; CHECK-CVT-NEXT: movi v1.4s, #127, msl #8
+; CHECK-CVT-NEXT: mov v2.s[3], v0.s[0]
+; CHECK-CVT-NEXT: movi v0.4s, #1
+; CHECK-CVT-NEXT: ushr v3.4s, v2.4s, #16
+; CHECK-CVT-NEXT: add v1.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT: and v0.16b, v3.16b, v0.16b
+; CHECK-CVT-NEXT: fcmeq v3.4s, v2.4s, v2.4s
+; CHECK-CVT-NEXT: orr v2.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT: bif v0.16b, v2.16b, v3.16b
; CHECK-CVT-NEXT: shrn v0.4h, v0.4s, #16
; CHECK-CVT-NEXT: ret
;
; CHECK-BF16-LABEL: sitofp_i64:
; CHECK-BF16: // %bb.0:
-; CHECK-BF16-NEXT: scvtf v0.2d, v0.2d
-; CHECK-BF16-NEXT: scvtf v1.2d, v1.2d
-; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: mov x8, v0.d[1]
+; CHECK-BF16-NEXT: fmov x9, d0
+; CHECK-BF16-NEXT: scvtf s2, x9
+; CHECK-BF16-NEXT: mov x9, v1.d[1]
+; CHECK-BF16-NEXT: scvtf s0, x8
+; CHECK-BF16-NEXT: fmov x8, d1
+; CHECK-BF16-NEXT: mov v2.s[1], v0.s[0]
+; CHECK-BF16-NEXT: scvtf s0, x8
+; CHECK-BF16-NEXT: mov v2.s[2], v0.s[0]
+; CHECK-BF16-NEXT: scvtf s0, x9
+; CHECK-BF16-NEXT: mov v2.s[3], v0.s[0]
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v2.4s
; CHECK-BF16-NEXT: ret
%1 = sitofp <4 x i64> %a to <4 x bfloat>
ret <4 x bfloat> %1
@@ -413,12 +427,39 @@ define <4 x bfloat> @uitofp_i32(<4 x i32> %a) #0 {
define <4 x bfloat> @uitofp_i64(<4 x i64> %a) #0 {
; CHECK-CVT-LABEL: uitofp_i64:
; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-CVT-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
-; CHECK-CVT-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-CVT-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-CVT-NEXT: movi v2.2d, #0x000000ffffffff
+; CHECK-CVT-NEXT: ushr v3.2d, v0.2d, #32
+; CHECK-CVT-NEXT: ushr v4.2d, v1.2d, #32
+; CHECK-CVT-NEXT: mov x8, v3.d[1]
+; CHECK-CVT-NEXT: fmov x10, d3
+; CHECK-CVT-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-CVT-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-CVT-NEXT: scvtf s3, x10
+; CHECK-CVT-NEXT: scvtf s5, x8
+; CHECK-CVT-NEXT: fmov x8, d0
+; CHECK-CVT-NEXT: mov x9, v0.d[1]
+; CHECK-CVT-NEXT: scvtf s2, x8
+; CHECK-CVT-NEXT: fmov x8, d4
+; CHECK-CVT-NEXT: scvtf s0, x9
+; CHECK-CVT-NEXT: mov x9, v4.d[1]
+; CHECK-CVT-NEXT: mov v3.s[1], v5.s[0]
+; CHECK-CVT-NEXT: scvtf s4, x8
+; CHECK-CVT-NEXT: fmov x8, d1
+; CHECK-CVT-NEXT: mov v2.s[1], v0.s[0]
+; CHECK-CVT-NEXT: scvtf s0, x8
+; CHECK-CVT-NEXT: mov x8, v1.d[1]
+; CHECK-CVT-NEXT: scvtf s1, x9
+; CHECK-CVT-NEXT: mov v3.s[2], v4.s[0]
+; CHECK-CVT-NEXT: mov v2.s[2], v0.s[0]
+; CHECK-CVT-NEXT: scvtf s0, x8
+; CHECK-CVT-NEXT: mov w8, #1333788672 // =0x4f800000
+; CHECK-CVT-NEXT: mov v3.s[3], v1.s[0]
+; CHECK-CVT-NEXT: dup v1.4s, w8
+; CHECK-CVT-NEXT: mov v2.s[3], v0.s[0]
+; CHECK-CVT-NEXT: fmul v0.4s, v3.4s, v1.4s
; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: fadd v0.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
; CHECK-CVT-NEXT: ushr v3.4s, v0.4s, #16
; CHECK-CVT-NEXT: add v2.4s, v0.4s, v2.4s
; CHECK-CVT-NEXT: and v1.16b, v3.16b, v1.16b
@@ -431,10 +472,37 @@ define <4 x bfloat> @uitofp_i64(<4 x i64> %a) #0 {
;
; CHECK-BF16-LABEL: uitofp_i64:
; CHECK-BF16: // %bb.0:
-; CHECK-BF16-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-BF16-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-BF16-NEXT: movi v2.2d, #0x000000ffffffff
+; CHECK-BF16-NEXT: ushr v3.2d, v0.2d, #32
+; CHECK-BF16-NEXT: ushr v4.2d, v1.2d, #32
+; CHECK-BF16-NEXT: mov x8, v3.d[1]
+; CHECK-BF16-NEXT: fmov x10, d3
+; CHECK-BF16-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-BF16-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-BF16-NEXT: scvtf s3, x10
+; CHECK-BF16-NEXT: scvtf s5, x8
+; CHECK-BF16-NEXT: fmov x8, d0
+; CHECK-BF16-NEXT: mov x9, v0.d[1]
+; CHECK-BF16-NEXT: scvtf s2, x8
+; CHECK-BF16-NEXT: fmov x8, d4
+; CHECK-BF16-NEXT: scvtf s0, x9
+; CHECK-BF16-NEXT: mov x9, v4.d[1]
+; CHECK-BF16-NEXT: mov v3.s[1], v5.s[0]
+; CHECK-BF16-NEXT: scvtf s4, x8
+; CHECK-BF16-NEXT: fmov x8, d1
+; CHECK-BF16-NEXT: mov v2.s[1], v0.s[0]
+; CHECK-BF16-NEXT: scvtf s0, x8
+; CHECK-BF16-NEXT: mov x8, v1.d[1]
+; CHECK-BF16-NEXT: scvtf s1, x9
+; CHECK-BF16-NEXT: mov v3.s[2], v4.s[0]
+; CHECK-BF16-NEXT: mov v2.s[2], v0.s[0]
+; CHECK-BF16-NEXT: scvtf s0, x8
+; CHECK-BF16-NEXT: mov w8, #1333788672 // =0x4f800000
+; CHECK-BF16-NEXT: mov v3.s[3], v1.s[0]
+; CHECK-BF16-NEXT: dup v1.4s, w8
+; CHECK-BF16-NEXT: mov v2.s[3], v0.s[0]
+; CHECK-BF16-NEXT: fmul v0.4s, v3.4s, v1.4s
+; CHECK-BF16-NEXT: fadd v0.4s, v0.4s, v2.4s
; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
; CHECK-BF16-NEXT: ret
%1 = uitofp <4 x i64> %a to <4 x bfloat>
diff --git a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
index 2eaa58de92807..e525ada5c9a61 100644
--- a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
@@ -489,45 +489,74 @@ define <8 x bfloat> @sitofp_i32(<8 x i32> %a) #0 {
define <8 x bfloat> @sitofp_i64(<8 x i64> %a) #0 {
; CHECK-CVT-LABEL: sitofp_i64:
; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: scvtf v2.2d, v2.2d
-; CHECK-CVT-NEXT: scvtf v0.2d, v0.2d
-; CHECK-CVT-NEXT: scvtf v3.2d, v3.2d
-; CHECK-CVT-NEXT: scvtf v1.2d, v1.2d
-; CHECK-CVT-NEXT: fcvtn v2.2s, v2.2d
-; CHECK-CVT-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-CVT-NEXT: fcvtn2 v2.4s, v3.2d
-; CHECK-CVT-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-CVT-NEXT: movi v1.4s, #1
-; CHECK-CVT-NEXT: movi v3.4s, #127, msl #8
-; CHECK-CVT-NEXT: ushr v4.4s, v2.4s, #16
-; CHECK-CVT-NEXT: ushr v5.4s, v0.4s, #16
-; CHECK-CVT-NEXT: add v6.4s, v2.4s, v3.4s
-; CHECK-CVT-NEXT: add v3.4s, v0.4s, v3.4s
-; CHECK-CVT-NEXT: and v4.16b, v4.16b, v1.16b
-; CHECK-CVT-NEXT: and v1.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT: fmov x10, d2
+; CHECK-CVT-NEXT: mov x8, v2.d[1]
+; CHECK-CVT-NEXT: mov x9, v0.d[1]
+; CHECK-CVT-NEXT: scvtf s2, x10
+; CHECK-CVT-NEXT: fmov x10, d0
+; CHECK-CVT-NEXT: scvtf s0, x8
+; CHECK-CVT-NEXT: scvtf s5, x9
+; CHECK-CVT-NEXT: fmov x9, d3
+; CHECK-CVT-NEXT: mov x8, v3.d[1]
+; CHECK-CVT-NEXT: scvtf s4, x10
+; CHECK-CVT-NEXT: fmov x10, d1
+; CHECK-CVT-NEXT: scvtf s3, x9
+; CHECK-CVT-NEXT: mov x9, v1.d[1]
+; CHECK-CVT-NEXT: mov v2.s[1], v0.s[0]
+; CHECK-CVT-NEXT: scvtf s0, x10
+; CHECK-CVT-NEXT: scvtf s1, x8
+; CHECK-CVT-NEXT: mov v4.s[1], v5.s[0]
+; CHECK-CVT-NEXT: mov v2.s[2], v3.s[0]
+; CHECK-CVT-NEXT: scvtf s3, x9
+; CHECK-CVT-NEXT: mov v4.s[2], v0.s[0]
+; CHECK-CVT-NEXT: movi v0.4s, #1
+; CHECK-CVT-NEXT: mov v2.s[3], v1.s[0]
+; CHECK-CVT-NEXT: movi v1.4s, #127, msl #8
+; CHECK-CVT-NEXT: mov v4.s[3], v3.s[0]
+; CHECK-CVT-NEXT: ushr v3.4s, v2.4s, #16
+; CHECK-CVT-NEXT: add v6.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT: ushr v5.4s, v4.4s, #16
+; CHECK-CVT-NEXT: add v1.4s, v4.4s, v1.4s
+; CHECK-CVT-NEXT: and v3.16b, v3.16b, v0.16b
+; CHECK-CVT-NEXT: and v0.16b, v5.16b, v0.16b
; CHECK-CVT-NEXT: fcmeq v5.4s, v2.4s, v2.4s
; CHECK-CVT-NEXT: orr v2.4s, #64, lsl #16
-; CHECK-CVT-NEXT: add v4.4s, v4.4s, v6.4s
-; CHECK-CVT-NEXT: fcmeq v6.4s, v0.4s, v0.4s
-; CHECK-CVT-NEXT: add v1.4s, v1.4s, v3.4s
-; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
-; CHECK-CVT-NEXT: bit v2.16b, v4.16b, v5.16b
-; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v6.16b
-; CHECK-CVT-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT: add v3.4s, v3.4s, v6.4s
+; CHECK-CVT-NEXT: fcmeq v6.4s, v4.4s, v4.4s
+; CHECK-CVT-NEXT: orr v4.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT: mov v1.16b, v5.16b
+; CHECK-CVT-NEXT: bif v0.16b, v4.16b, v6.16b
+; CHECK-CVT-NEXT: bsl v1.16b, v3.16b, v2.16b
+; CHECK-CVT-NEXT: uzp2 v0.8h, v0.8h, v1.8h
; CHECK-CVT-NEXT: ret
;
; CHECK-BF16-LABEL: sitofp_i64:
; CHECK-BF16: // %bb.0:
-; CHECK-BF16-NEXT: scvtf v0.2d, v0.2d
-; CHECK-BF16-NEXT: scvtf v2.2d, v2.2d
-; CHECK-BF16-NEXT: scvtf v1.2d, v1.2d
-; CHECK-BF16-NEXT: scvtf v3.2d, v3.2d
-; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-BF16-NEXT: fcvtn v2.2s, v2.2d
-; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-BF16-NEXT: fcvtn2 v2.4s, v3.2d
-; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
-; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v2.4s
+; CHECK-BF16-NEXT: mov x9, v0.d[1]
+; CHECK-BF16-NEXT: fmov x10, d0
+; CHECK-BF16-NEXT: mov x8, v2.d[1]
+; CHECK-BF16-NEXT: scvtf s4, x10
+; CHECK-BF16-NEXT: fmov x10, d1
+; CHECK-BF16-NEXT: scvtf s0, x9
+; CHECK-BF16-NEXT: fmov x9, d2
+; CHECK-BF16-NEXT: scvtf s2, x8
+; CHECK-BF16-NEXT: mov x8, v1.d[1]
+; CHECK-BF16-NEXT: scvtf s1, x9
+; CHECK-BF16-NEXT: fmov x9, d3
+; CHECK-BF16-NEXT: mov v4.s[1], v0.s[0]
+; CHECK-BF16-NEXT: scvtf s0, x10
+; CHECK-BF16-NEXT: mov x10, v3.d[1]
+; CHECK-BF16-NEXT: scvtf s3, x9
+; CHECK-BF16-NEXT: mov v1.s[1], v2.s[0]
+; CHECK-BF16-NEXT: scvtf s2, x8
+; CHECK-BF16-NEXT: mov v4.s[2], v0.s[0]
+; CHECK-BF16-NEXT: scvtf s0, x10
+; CHECK-BF16-NEXT: mov v1.s[2], v3.s[0]
+; CHECK-BF16-NEXT: mov v4.s[3], v2.s[0]
+; CHECK-BF16-NEXT: mov v1.s[3], v0.s[0]
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v4.4s
+; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s
; CHECK-BF16-NEXT: ret
%1 = sitofp <8 x i64> %a to <8 x bfloat>
ret <8 x bfloat> %1
@@ -712,45 +741,147 @@ define <8 x bfloat> @uitofp_i32(<8 x i32> %a) #0 {
define <8 x bfloat> @uitofp_i64(<8 x i64> %a) #0 {
; CHECK-CVT-LABEL: uitofp_i64:
; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: ucvtf v2.2d, v2.2d
-; CHECK-CVT-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-CVT-NEXT: ucvtf v3.2d, v3.2d
-; CHECK-CVT-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-CVT-NEXT: fcvtn v2.2s, v2.2d
-; CHECK-CVT-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-CVT-NEXT: fcvtn2 v2.4s, v3.2d
-; CHECK-CVT-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: ushr v5.2d, v2.2d, #32
+; CHECK-CVT-NEXT: movi v4.2d, #0x000000ffffffff
+; CHECK-CVT-NEXT: ushr v6.2d, v0.2d, #32
+; CHECK-CVT-NEXT: ushr v7.2d, v3.2d, #32
+; CHECK-CVT-NEXT: ushr v16.2d, v1.2d, #32
+; CHECK-CVT-NEXT: fmov x10, d5
+; CHECK-CVT-NEXT: mov x8, v5.d[1]
+; CHECK-CVT-NEXT: mov x9, v6.d[1]
+; CHECK-CVT-NEXT: and v2.16b, v2.16b, v4.16b
+; CHECK-CVT-NEXT: and v3.16b, v3.16b, v4.16b
+; CHECK-CVT-NEXT: and v0.16b, v0.16b, v4.16b
+; CHECK-CVT-NEXT: and v1.16b, v1.16b, v4.16b
+; CHECK-CVT-NEXT: fmov x12, d7
+; CHECK-CVT-NEXT: mov x11, v7.d[1]
+; CHECK-CVT-NEXT: scvtf s4, x10
+; CHECK-CVT-NEXT: fmov x10, d6
+; CHECK-CVT-NEXT: mov x13, v2.d[1]
+; CHECK-CVT-NEXT: scvtf s5, x8
+; CHECK-CVT-NEXT: mov x8, v0.d[1]
+; CHECK-CVT-NEXT: scvtf s7, x9
+; CHECK-CVT-NEXT: scvtf s17, x12
+; CHECK-CVT-NEXT: fmov x12, d16
+; CHECK-CVT-NEXT: scvtf s6, x10
+; CHECK-CVT-NEXT: fmov x10, d2
+; CHECK-CVT-NEXT: mov x9, v16.d[1]
+; CHECK-CVT-NEXT: scvtf s16, x13
+; CHECK-CVT-NEXT: mov v4.s[1], v5.s[0]
+; CHECK-CVT-NEXT: fmov x13, d1
+; CHECK-CVT-NEXT: scvtf s2, x10
+; CHECK-CVT-NEXT: fmov x10, d0
+; CHECK-CVT-NEXT: scvtf s0, x12
+; CHECK-CVT-NEXT: mov v6.s[1], v7.s[0]
+; CHECK-CVT-NEXT: scvtf s7, x8
+; CHECK-CVT-NEXT: mov x8, v3.d[1]
+; CHECK-CVT-NEXT: mov x12, v1.d[1]
+; CHECK-CVT-NEXT: scvtf s5, x10
+; CHECK-CVT-NEXT: fmov x10, d3
+; CHECK-CVT-NEXT: scvtf s3, x11
+; CHECK-CVT-NEXT: mov v2.s[1], v16.s[0]
+; CHECK-CVT-NEXT: mov v4.s[2], v17.s[0]
+; CHECK-CVT-NEXT: scvtf s16, x13
+; CHECK-CVT-NEXT: mov v6.s[2], v0.s[0]
+; CHECK-CVT-NEXT: scvtf s0, x9
+; CHECK-CVT-NEXT: mov w9, #1333788672 // =0x4f800000
+; CHECK-CVT-NEXT: scvtf s1, x10
+; CHECK-CVT-NEXT: mov v5.s[1], v7.s[0]
+; CHECK-CVT-NEXT: dup v7.4s, w9
+; CHECK-CVT-NEXT: mov v4.s[3], v3.s[0]
+; CHECK-CVT-NEXT: scvtf s3, x12
+; CHECK-CVT-NEXT: mov v6.s[3], v0.s[0]
+; CHECK-CVT-NEXT: mov v2.s[2], v1.s[0]
+; CHECK-CVT-NEXT: scvtf s1, x8
+; CHECK-CVT-NEXT: mov v5.s[2], v16.s[0]
+; CHECK-CVT-NEXT: fmul v0.4s, v4.4s, v7.4s
+; CHECK-CVT-NEXT: mov v2.s[3], v1.s[0]
+; CHECK-CVT-NEXT: fmul v1.4s, v6.4s, v7.4s
+; CHECK-CVT-NEXT: mov v5.s[3], v3.s[0]
; CHECK-CVT-NEXT: movi v3.4s, #127, msl #8
-; CHECK-CVT-NEXT: ushr v4.4s, v2.4s, #16
-; CHECK-CVT-NEXT: ushr v5.4s, v0.4s, #16
-; CHECK-CVT-NEXT: add v6.4s, v2.4s, v3.4s
-; CHECK-CVT-NEXT: add v3.4s, v0.4s, v3.4s
-; CHECK-CVT-NEXT: and v4.16b, v4.16b, v1.16b
-; CHECK-CVT-NEXT: and v1.16b, v5.16b, v1.16b
-; CHECK-CVT-NEXT: fcmeq v5.4s, v2.4s, v2.4s
-; CHECK-CVT-NEXT: orr v2.4s, #64, lsl #16
-; CHECK-CVT-NEXT: add v4.4s, v4.4s, v6.4s
-; CHECK-CVT-NEXT: fcmeq v6.4s, v0.4s, v0.4s
-; CHECK-CVT-NEXT: add v1.4s, v1.4s, v3.4s
+; CHECK-CVT-NEXT: fadd v0.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT: movi v2.4s, #1
+; CHECK-CVT-NEXT: fadd v1.4s, v1.4s, v5.4s
+; CHECK-CVT-NEXT: ushr v4.4s, v0.4s, #16
+; CHECK-CVT-NEXT: add v6.4s, v0.4s, v3.4s
+; CHECK-CVT-NEXT: ushr v5.4s, v1.4s, #16
+; CHECK-CVT-NEXT: add v3.4s, v1.4s, v3.4s
+; CHECK-CVT-NEXT: fcmeq v7.4s, v1.4s, v1.4s
+; CHECK-CVT-NEXT: orr v1.4s, #64, lsl #16
+; CHECK-CVT-NEXT: and v4.16b, v4.16b, v2.16b
+; CHECK-CVT-NEXT: and v2.16b, v5.16b, v2.16b
+; CHECK-CVT-NEXT: fcmeq v5.4s, v0.4s, v0.4s
; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
-; CHECK-CVT-NEXT: bit v2.16b, v4.16b, v5.16b
-; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v6.16b
-; CHECK-CVT-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT: add v4.4s, v4.4s, v6.4s
+; CHECK-CVT-NEXT: add v2.4s, v2.4s, v3.4s
+; CHECK-CVT-NEXT: bit v0.16b, v4.16b, v5.16b
+; CHECK-CVT-NEXT: bit v1.16b, v2.16b, v7.16b
+; CHECK-CVT-NEXT: uzp2 v0.8h, v1.8h, v0.8h
; CHECK-CVT-NEXT: ret
;
; CHECK-BF16-LABEL: uitofp_i64:
; CHECK-BF16: // %bb.0:
-; CHECK-BF16-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-BF16-NEXT: ucvtf v2.2d, v2.2d
-; CHECK-BF16-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-BF16-NEXT: ucvtf v3.2d, v3.2d
-; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-BF16-NEXT: fcvtn v2.2s, v2.2d
-; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-BF16-NEXT: fcvtn2 v2.4s, v3.2d
+; CHECK-BF16-NEXT: movi v4.2d, #0x000000ffffffff
+; CHECK-BF16-NEXT: ushr v6.2d, v2.2d, #32
+; CHECK-BF16-NEXT: ushr v5.2d, v0.2d, #32
+; CHECK-BF16-NEXT: ushr v7.2d, v1.2d, #32
+; CHECK-BF16-NEXT: mov x9, v6.d[1]
+; CHECK-BF16-NEXT: fmov x10, d6
+; CHECK-BF16-NEXT: mov x8, v5.d[1]
+; CHECK-BF16-NEXT: and v0.16b, v0.16b, v4.16b
+; CHECK-BF16-NEXT: and v2.16b, v2.16b, v4.16b
+; CHECK-BF16-NEXT: and v1.16b, v1.16b, v4.16b
+; CHECK-BF16-NEXT: scvtf s6, x10
+; CHECK-BF16-NEXT: fmov x10, d5
+; CHECK-BF16-NEXT: mov x11, v0.d[1]
+; CHECK-BF16-NEXT: scvtf s17, x9
+; CHECK-BF16-NEXT: fmov x9, d0
+; CHECK-BF16-NEXT: scvtf s16, x8
+; CHECK-BF16-NEXT: mov x8, v2.d[1]
+; CHECK-BF16-NEXT: scvtf s5, x10
+; CHECK-BF16-NEXT: mov x10, v7.d[1]
+; CHECK-BF16-NEXT: scvtf s0, x9
+; CHECK-BF16-NEXT: fmov x9, d2
+; CHECK-BF16-NEXT: scvtf s2, x11
+; CHECK-BF16-NEXT: fmov x11, d7
+; CHECK-BF16-NEXT: ushr v7.2d, v3.2d, #32
+; CHECK-BF16-NEXT: and v3.16b, v3.16b, v4.16b
+; CHECK-BF16-NEXT: mov v6.s[1], v17.s[0]
+; CHECK-BF16-NEXT: mov v5.s[1], v16.s[0]
+; CHECK-BF16-NEXT: scvtf s17, x9
+; CHECK-BF16-NEXT: scvtf s16, x8
+; CHECK-BF16-NEXT: scvtf s4, x11
+; CHECK-BF16-NEXT: fmov x9, d7
+; CHECK-BF16-NEXT: fmov x11, d1
+; CHECK-BF16-NEXT: mov x8, v7.d[1]
+; CHECK-BF16-NEXT: mov v0.s[1], v2.s[0]
+; CHECK-BF16-NEXT: scvtf s7, x9
+; CHECK-BF16-NEXT: scvtf s2, x11
+; CHECK-BF16-NEXT: mov x9, v1.d[1]
+; CHECK-BF16-NEXT: mov v5.s[2], v4.s[0]
+; CHECK-BF16-NEXT: scvtf s1, x10
+; CHECK-BF16-NEXT: fmov x10, d3
+; CHECK-BF16-NEXT: mov x11, v3.d[1]
+; CHECK-BF16-NEXT: mov v17.s[1], v16.s[0]
+; CHECK-BF16-NEXT: scvtf s4, x8
+; CHECK-BF16-NEXT: mov w8, #1333788672 // =0x4f800000
+; CHECK-BF16-NEXT: scvtf s3, x10
+; CHECK-BF16-NEXT: mov v6.s[2], v7.s[0]
+; CHECK-BF16-NEXT: mov v0.s[2], v2.s[0]
+; CHECK-BF16-NEXT: scvtf s2, x9
+; CHECK-BF16-NEXT: mov v5.s[3], v1.s[0]
+; CHECK-BF16-NEXT: dup v1.4s, w8
+; CHECK-BF16-NEXT: mov v17.s[2], v3.s[0]
+; CHECK-BF16-NEXT: scvtf s3, x11
+; CHECK-BF16-NEXT: mov v6.s[3], v4.s[0]
+; CHECK-BF16-NEXT: mov v0.s[3], v2.s[0]
+; CHECK-BF16-NEXT: fmul v2.4s, v5.4s, v1.4s
+; CHECK-BF16-NEXT: mov v17.s[3], v3.s[0]
+; CHECK-BF16-NEXT: fmul v1.4s, v6.4s, v1.4s
+; CHECK-BF16-NEXT: fadd v0.4s, v2.4s, v0.4s
+; CHECK-BF16-NEXT: fadd v1.4s, v1.4s, v17.4s
; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
-; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v2.4s
+; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s
; CHECK-BF16-NEXT: ret
%1 = uitofp <8 x i64> %a to <8 x bfloat>
ret <8 x bfloat> %1
diff --git a/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll b/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll
index 506e5e59a3529..1fb4a14806c41 100644
--- a/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll
@@ -1,9 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
; CHECK: autogen_SD19655
; CHECK: scvtf
; CHECK: ret
define void @autogen_SD19655(ptr %addr, ptr %addrfloat) {
+; CHECK-LABEL: autogen_SD19655:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: mov.d x8, v0[1]
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: scvtf s1, x9
+; CHECK-NEXT: scvtf s0, x8
+; CHECK-NEXT: mov.s v1[1], v0[0]
+; CHECK-NEXT: str d1, [x1]
+; CHECK-NEXT: ret
%T = load <2 x i64>, ptr %addr
%F = sitofp <2 x i64> %T to <2 x float>
store <2 x float> %F, ptr %addrfloat
@@ -12,38 +23,44 @@ define void @autogen_SD19655(ptr %addr, ptr %addrfloat) {
define <2 x double> @test_signed_v2i32_to_v2f64(<2 x i32> %v) nounwind readnone {
; CHECK-LABEL: test_signed_v2i32_to_v2f64:
-; CHECK: sshll.2d [[VAL64:v[0-9]+]], v0, #0
-; CHECK-NEXT: scvtf.2d v0, [[VAL64]]
-; CHECK-NEXT: ret
+; CHECK: // %bb.0:
+; CHECK-NEXT: sshll.2d v0, v0, #0
+; CHECK-NEXT: scvtf.2d v0, v0
+; CHECK-NEXT: ret
%conv = sitofp <2 x i32> %v to <2 x double>
ret <2 x double> %conv
}
define <2 x double> @test_unsigned_v2i32_to_v2f64(<2 x i32> %v) nounwind readnone {
-; CHECK-LABEL: test_unsigned_v2i32_to_v2f64
-; CHECK: ushll.2d [[VAL64:v[0-9]+]], v0, #0
-; CHECK-NEXT: ucvtf.2d v0, [[VAL64]]
-; CHECK-NEXT: ret
+; CHECK-LABEL: test_unsigned_v2i32_to_v2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll.2d v0, v0, #0
+; CHECK-NEXT: ucvtf.2d v0, v0
+; CHECK-NEXT: ret
%conv = uitofp <2 x i32> %v to <2 x double>
ret <2 x double> %conv
}
define <2 x double> @test_signed_v2i16_to_v2f64(<2 x i16> %v) nounwind readnone {
; CHECK-LABEL: test_signed_v2i16_to_v2f64:
-; CHECK: shl.2s [[TMP:v[0-9]+]], v0, #16
-; CHECK: sshr.2s [[VAL32:v[0-9]+]], [[TMP]], #16
-; CHECK: sshll.2d [[VAL64:v[0-9]+]], [[VAL32]], #0
-; CHECK: scvtf.2d v0, [[VAL64]]
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl.2s v0, v0, #16
+; CHECK-NEXT: sshr.2s v0, v0, #16
+; CHECK-NEXT: sshll.2d v0, v0, #0
+; CHECK-NEXT: scvtf.2d v0, v0
+; CHECK-NEXT: ret
%conv = sitofp <2 x i16> %v to <2 x double>
ret <2 x double> %conv
}
define <2 x double> @test_unsigned_v2i16_to_v2f64(<2 x i16> %v) nounwind readnone {
-; CHECK-LABEL: test_unsigned_v2i16_to_v2f64
-; CHECK: movi d[[MASK:[0-9]+]], #0x00ffff0000ffff
-; CHECK: and.8b [[VAL32:v[0-9]+]], v0, v[[MASK]]
-; CHECK: ushll.2d [[VAL64:v[0-9]+]], [[VAL32]], #0
-; CHECK: ucvtf.2d v0, [[VAL64]]
+; CHECK-LABEL: test_unsigned_v2i16_to_v2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi d1, #0x00ffff0000ffff
+; CHECK-NEXT: and.8b v0, v0, v1
+; CHECK-NEXT: ushll.2d v0, v0, #0
+; CHECK-NEXT: ucvtf.2d v0, v0
+; CHECK-NEXT: ret
%conv = uitofp <2 x i16> %v to <2 x double>
ret <2 x double> %conv
@@ -51,20 +68,24 @@ define <2 x double> @test_unsigned_v2i16_to_v2f64(<2 x i16> %v) nounwind readnon
define <2 x double> @test_signed_v2i8_to_v2f64(<2 x i8> %v) nounwind readnone {
; CHECK-LABEL: test_signed_v2i8_to_v2f64:
-; CHECK: shl.2s [[TMP:v[0-9]+]], v0, #24
-; CHECK: sshr.2s [[VAL32:v[0-9]+]], [[TMP]], #24
-; CHECK: sshll.2d [[VAL64:v[0-9]+]], [[VAL32]], #0
-; CHECK: scvtf.2d v0, [[VAL64]]
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl.2s v0, v0, #24
+; CHECK-NEXT: sshr.2s v0, v0, #24
+; CHECK-NEXT: sshll.2d v0, v0, #0
+; CHECK-NEXT: scvtf.2d v0, v0
+; CHECK-NEXT: ret
%conv = sitofp <2 x i8> %v to <2 x double>
ret <2 x double> %conv
}
define <2 x double> @test_unsigned_v2i8_to_v2f64(<2 x i8> %v) nounwind readnone {
-; CHECK-LABEL: test_unsigned_v2i8_to_v2f64
-; CHECK: movi d[[MASK:[0-9]+]], #0x0000ff000000ff
-; CHECK: and.8b [[VAL32:v[0-9]+]], v0, v[[MASK]]
-; CHECK: ushll.2d [[VAL64:v[0-9]+]], [[VAL32]], #0
-; CHECK: ucvtf.2d v0, [[VAL64]]
+; CHECK-LABEL: test_unsigned_v2i8_to_v2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi d1, #0x0000ff000000ff
+; CHECK-NEXT: and.8b v0, v0, v1
+; CHECK-NEXT: ushll.2d v0, v0, #0
+; CHECK-NEXT: ucvtf.2d v0, v0
+; CHECK-NEXT: ret
%conv = uitofp <2 x i8> %v to <2 x double>
ret <2 x double> %conv
@@ -72,16 +93,39 @@ define <2 x double> @test_unsigned_v2i8_to_v2f64(<2 x i8> %v) nounwind readnone
define <2 x float> @test_signed_v2i64_to_v2f32(<2 x i64> %v) nounwind readnone {
; CHECK-LABEL: test_signed_v2i64_to_v2f32:
-; CHECK: scvtf.2d [[VAL64:v[0-9]+]], v0
-; CHECK: fcvtn v0.2s, [[VAL64]].2d
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov.d x8, v0[1]
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: scvtf s0, x9
+; CHECK-NEXT: scvtf s1, x8
+; CHECK-NEXT: mov.s v0[1], v1[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
%conv = sitofp <2 x i64> %v to <2 x float>
ret <2 x float> %conv
}
define <2 x float> @test_unsigned_v2i64_to_v2f32(<2 x i64> %v) nounwind readnone {
-; CHECK-LABEL: test_unsigned_v2i64_to_v2f32
-; CHECK: ucvtf.2d [[VAL64:v[0-9]+]], v0
-; CHECK: fcvtn v0.2s, [[VAL64]].2d
+; CHECK-LABEL: test_unsigned_v2i64_to_v2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi.2d v1, #0x000000ffffffff
+; CHECK-NEXT: ushr.2d v2, v0, #32
+; CHECK-NEXT: mov.d x8, v2[1]
+; CHECK-NEXT: fmov x9, d2
+; CHECK-NEXT: and.16b v0, v0, v1
+; CHECK-NEXT: scvtf s2, x9
+; CHECK-NEXT: mov w9, #1333788672 // =0x4f800000
+; CHECK-NEXT: scvtf s1, x8
+; CHECK-NEXT: mov.d x8, v0[1]
+; CHECK-NEXT: dup.2s v3, w9
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: scvtf s0, x8
+; CHECK-NEXT: mov.s v2[1], v1[0]
+; CHECK-NEXT: scvtf s1, x9
+; CHECK-NEXT: fmul.2s v2, v2, v3
+; CHECK-NEXT: mov.s v1[1], v0[0]
+; CHECK-NEXT: fadd.2s v0, v2, v1
+; CHECK-NEXT: ret
%conv = uitofp <2 x i64> %v to <2 x float>
ret <2 x float> %conv
@@ -89,18 +133,22 @@ define <2 x float> @test_unsigned_v2i64_to_v2f32(<2 x i64> %v) nounwind readnone
define <2 x float> @test_signed_v2i16_to_v2f32(<2 x i16> %v) nounwind readnone {
; CHECK-LABEL: test_signed_v2i16_to_v2f32:
-; CHECK: shl.2s [[TMP:v[0-9]+]], v0, #16
-; CHECK: sshr.2s [[VAL32:v[0-9]+]], [[TMP]], #16
-; CHECK: scvtf.2s v0, [[VAL32]]
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl.2s v0, v0, #16
+; CHECK-NEXT: sshr.2s v0, v0, #16
+; CHECK-NEXT: scvtf.2s v0, v0
+; CHECK-NEXT: ret
%conv = sitofp <2 x i16> %v to <2 x float>
ret <2 x float> %conv
}
define <2 x float> @test_unsigned_v2i16_to_v2f32(<2 x i16> %v) nounwind readnone {
-; CHECK-LABEL: test_unsigned_v2i16_to_v2f32
-; CHECK: movi d[[MASK:[0-9]+]], #0x00ffff0000ffff
-; CHECK: and.8b [[VAL32:v[0-9]+]], v0, v[[MASK]]
-; CHECK: ucvtf.2s v0, [[VAL32]]
+; CHECK-LABEL: test_unsigned_v2i16_to_v2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi d1, #0x00ffff0000ffff
+; CHECK-NEXT: and.8b v0, v0, v1
+; CHECK-NEXT: ucvtf.2s v0, v0
+; CHECK-NEXT: ret
%conv = uitofp <2 x i16> %v to <2 x float>
ret <2 x float> %conv
@@ -108,18 +156,22 @@ define <2 x float> @test_unsigned_v2i16_to_v2f32(<2 x i16> %v) nounwind readnone
define <2 x float> @test_signed_v2i8_to_v2f32(<2 x i8> %v) nounwind readnone {
; CHECK-LABEL: test_signed_v2i8_to_v2f32:
-; CHECK: shl.2s [[TMP:v[0-9]+]], v0, #24
-; CHECK: sshr.2s [[VAL32:v[0-9]+]], [[TMP]], #24
-; CHECK: scvtf.2s v0, [[VAL32]]
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl.2s v0, v0, #24
+; CHECK-NEXT: sshr.2s v0, v0, #24
+; CHECK-NEXT: scvtf.2s v0, v0
+; CHECK-NEXT: ret
%conv = sitofp <2 x i8> %v to <2 x float>
ret <2 x float> %conv
}
define <2 x float> @test_unsigned_v2i8_to_v2f32(<2 x i8> %v) nounwind readnone {
-; CHECK-LABEL: test_unsigned_v2i8_to_v2f32
-; CHECK: movi d[[MASK:[0-9]+]], #0x0000ff000000ff
-; CHECK: and.8b [[VAL32:v[0-9]+]], v0, v[[MASK]]
-; CHECK: ucvtf.2s v0, [[VAL32]]
+; CHECK-LABEL: test_unsigned_v2i8_to_v2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi d1, #0x0000ff000000ff
+; CHECK-NEXT: and.8b v0, v0, v1
+; CHECK-NEXT: ucvtf.2s v0, v0
+; CHECK-NEXT: ret
%conv = uitofp <2 x i8> %v to <2 x float>
ret <2 x float> %conv
@@ -127,17 +179,21 @@ define <2 x float> @test_unsigned_v2i8_to_v2f32(<2 x i8> %v) nounwind readnone {
define <4 x float> @test_signed_v4i16_to_v4f32(<4 x i16> %v) nounwind readnone {
; CHECK-LABEL: test_signed_v4i16_to_v4f32:
-; CHECK: sshll.4s [[VAL32:v[0-9]+]], v0, #0
-; CHECK: scvtf.4s v0, [[VAL32]]
+; CHECK: // %bb.0:
+; CHECK-NEXT: sshll.4s v0, v0, #0
+; CHECK-NEXT: scvtf.4s v0, v0
+; CHECK-NEXT: ret
%conv = sitofp <4 x i16> %v to <4 x float>
ret <4 x float> %conv
}
define <4 x float> @test_unsigned_v4i16_to_v4f32(<4 x i16> %v) nounwind readnone {
-; CHECK-LABEL: test_unsigned_v4i16_to_v4f32
-; CHECK: ushll.4s [[VAL32:v[0-9]+]], v0, #0
-; CHECK: ucvtf.4s v0, [[VAL32]]
+; CHECK-LABEL: test_unsigned_v4i16_to_v4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: ucvtf.4s v0, v0
+; CHECK-NEXT: ret
%conv = uitofp <4 x i16> %v to <4 x float>
ret <4 x float> %conv
@@ -145,19 +201,23 @@ define <4 x float> @test_unsigned_v4i16_to_v4f32(<4 x i16> %v) nounwind readnone
define <4 x float> @test_signed_v4i8_to_v4f32(<4 x i8> %v) nounwind readnone {
; CHECK-LABEL: test_signed_v4i8_to_v4f32:
-; CHECK: shl.4h [[TMP:v[0-9]+]], v0, #8
-; CHECK: sshr.4h [[VAL16:v[0-9]+]], [[TMP]], #8
-; CHECK: sshll.4s [[VAL32:v[0-9]+]], [[VAL16]], #0
-; CHECK: scvtf.4s v0, [[VAL32]]
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl.4h v0, v0, #8
+; CHECK-NEXT: sshr.4h v0, v0, #8
+; CHECK-NEXT: sshll.4s v0, v0, #0
+; CHECK-NEXT: scvtf.4s v0, v0
+; CHECK-NEXT: ret
%conv = sitofp <4 x i8> %v to <4 x float>
ret <4 x float> %conv
}
define <4 x float> @test_unsigned_v4i8_to_v4f32(<4 x i8> %v) nounwind readnone {
-; CHECK-LABEL: test_unsigned_v4i8_to_v4f32
-; CHECK: bic.4h v0, #255, lsl #8
-; CHECK: ushll.4s [[VAL32:v[0-9]+]], v0, #0
-; CHECK: ucvtf.4s v0, [[VAL32]]
+; CHECK-LABEL: test_unsigned_v4i8_to_v4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bic.4h v0, #255, lsl #8
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: ucvtf.4s v0, v0
+; CHECK-NEXT: ret
%conv = uitofp <4 x i8> %v to <4 x float>
ret <4 x float> %conv
diff --git a/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll
index b40c0656a60e4..cef84c370b290 100644
--- a/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -260,11 +260,26 @@ define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nou
; CHECK-NEON-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
; CHECK-NEON: // %bb.0:
; CHECK-NEON-NEXT: mov w8, #2 // =0x2
+; CHECK-NEON-NEXT: movi v2.2d, #0x000000ffffffff
; CHECK-NEON-NEXT: dup v1.2d, x8
; CHECK-NEON-NEXT: ushl v0.2d, v1.2d, v0.2d
+; CHECK-NEON-NEXT: ushr v1.2d, v0.2d, #32
+; CHECK-NEON-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT: mov x8, v1.d[1]
+; CHECK-NEON-NEXT: fmov x9, d1
+; CHECK-NEON-NEXT: scvtf s2, x9
+; CHECK-NEON-NEXT: mov w9, #1333788672 // =0x4f800000
+; CHECK-NEON-NEXT: scvtf s1, x8
+; CHECK-NEON-NEXT: mov x8, v0.d[1]
+; CHECK-NEON-NEXT: dup v3.2s, w9
+; CHECK-NEON-NEXT: fmov x9, d0
+; CHECK-NEON-NEXT: scvtf s0, x8
+; CHECK-NEON-NEXT: mov v2.s[1], v1.s[0]
+; CHECK-NEON-NEXT: scvtf s1, x9
+; CHECK-NEON-NEXT: fmul v2.2s, v2.2s, v3.2s
+; CHECK-NEON-NEXT: mov v1.s[1], v0.s[0]
+; CHECK-NEON-NEXT: fadd v0.2s, v2.2s, v1.2s
; CHECK-NEON-NEXT: fmov v1.2s, #15.00000000
-; CHECK-NEON-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-NEON-NEXT: fcvtn v0.2s, v0.2d
; CHECK-NEON-NEXT: fmul v0.2s, v0.2s, v1.2s
; CHECK-NEON-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll b/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll
index 83e60c1089762..1364c47adff2d 100644
--- a/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll
@@ -193,10 +193,17 @@ define <4 x float> @uitofp_v4f32_v4i32(<4 x i32> %x) #0 {
define <4 x float> @sitofp_v4f32_v4i64(<4 x i64> %x) #0 {
; CHECK-LABEL: sitofp_v4f32_v4i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: scvtf v0.2d, v0.2d
-; CHECK-NEXT: scvtf v1.2d, v1.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: scvtf s0, x9
+; CHECK-NEXT: mov x9, v1.d[1]
+; CHECK-NEXT: scvtf s2, x8
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: scvtf s1, x8
+; CHECK-NEXT: mov v0.s[1], v2.s[0]
+; CHECK-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-NEXT: scvtf s1, x9
+; CHECK-NEXT: mov v0.s[3], v1.s[0]
; CHECK-NEXT: ret
%val = call <4 x float> @llvm.experimental.constrained.sitofp.v4f32.v4i64(<4 x i64> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
ret <4 x float> %val
@@ -205,10 +212,38 @@ define <4 x float> @sitofp_v4f32_v4i64(<4 x i64> %x) #0 {
define <4 x float> @uitofp_v4f32_v4i64(<4 x i64> %x) #0 {
; CHECK-LABEL: uitofp_v4f32_v4i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-NEXT: movi v2.2d, #0x000000ffffffff
+; CHECK-NEXT: ushr v3.2d, v1.2d, #32
+; CHECK-NEXT: ushr v4.2d, v0.2d, #32
+; CHECK-NEXT: mov x8, v3.d[1]
+; CHECK-NEXT: mov x9, v4.d[1]
+; CHECK-NEXT: fmov x10, d3
+; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: fmov x11, d4
+; CHECK-NEXT: scvtf s2, x10
+; CHECK-NEXT: mov x10, v1.d[1]
+; CHECK-NEXT: scvtf s3, x8
+; CHECK-NEXT: scvtf s4, x11
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: scvtf s5, x9
+; CHECK-NEXT: mov w9, #1333788672 // =0x4f800000
+; CHECK-NEXT: fmov x11, d1
+; CHECK-NEXT: dup v1.2s, w9
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: scvtf s0, x10
+; CHECK-NEXT: mov v2.s[1], v3.s[0]
+; CHECK-NEXT: scvtf s6, x11
+; CHECK-NEXT: scvtf s3, x8
+; CHECK-NEXT: mov v4.s[1], v5.s[0]
+; CHECK-NEXT: scvtf s5, x9
+; CHECK-NEXT: mov v6.s[1], v0.s[0]
+; CHECK-NEXT: fmul v0.2s, v2.2s, v1.2s
+; CHECK-NEXT: fmul v1.2s, v4.2s, v1.2s
+; CHECK-NEXT: mov v5.s[1], v3.s[0]
+; CHECK-NEXT: fadd v2.2s, v0.2s, v6.2s
+; CHECK-NEXT: fadd v0.2s, v1.2s, v5.2s
+; CHECK-NEXT: mov v0.d[1], v2.d[0]
; CHECK-NEXT: ret
%val = call <4 x float> @llvm.experimental.constrained.uitofp.v4f32.v4i64(<4 x i64> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
ret <4 x float> %val
diff --git a/llvm/test/CodeGen/AArch64/fp16-v16-instructions.ll b/llvm/test/CodeGen/AArch64/fp16-v16-instructions.ll
index 4c112cf89aec1..d48381600a268 100644
--- a/llvm/test/CodeGen/AArch64/fp16-v16-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/fp16-v16-instructions.ll
@@ -23,26 +23,54 @@ define <16 x half> @sitofp_i32(<16 x i32> %a) #0 {
define <16 x half> @sitofp_i64(<16 x i64> %a) #0 {
; CHECK-LABEL: sitofp_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: scvtf v0.2d, v0.2d
-; CHECK-NEXT: scvtf v4.2d, v4.2d
-; CHECK-NEXT: scvtf v2.2d, v2.2d
-; CHECK-NEXT: scvtf v1.2d, v1.2d
-; CHECK-NEXT: scvtf v6.2d, v6.2d
-; CHECK-NEXT: scvtf v5.2d, v5.2d
-; CHECK-NEXT: scvtf v3.2d, v3.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn v4.2s, v4.2d
-; CHECK-NEXT: fcvtn v2.2s, v2.2d
-; CHECK-NEXT: fcvtn v6.2s, v6.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT: scvtf v1.2d, v7.2d
-; CHECK-NEXT: fcvtn2 v4.4s, v5.2d
-; CHECK-NEXT: fcvtn2 v2.4s, v3.2d
+; CHECK-NEXT: fmov x12, d0
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: mov x9, v4.d[1]
+; CHECK-NEXT: mov x10, v2.d[1]
+; CHECK-NEXT: fmov x11, d2
+; CHECK-NEXT: mov x13, v1.d[1]
+; CHECK-NEXT: scvtf s0, x12
+; CHECK-NEXT: fmov x12, d4
+; CHECK-NEXT: scvtf s2, x8
+; CHECK-NEXT: mov x8, v6.d[1]
+; CHECK-NEXT: scvtf s16, x9
+; CHECK-NEXT: fmov x9, d1
+; CHECK-NEXT: scvtf s1, x10
+; CHECK-NEXT: fmov x10, d5
+; CHECK-NEXT: scvtf s4, x12
+; CHECK-NEXT: fmov x12, d6
+; CHECK-NEXT: scvtf s6, x11
+; CHECK-NEXT: mov x11, v5.d[1]
+; CHECK-NEXT: scvtf s5, x9
+; CHECK-NEXT: mov v0.s[1], v2.s[0]
+; CHECK-NEXT: scvtf s2, x10
+; CHECK-NEXT: scvtf s17, x12
+; CHECK-NEXT: fmov x9, d3
+; CHECK-NEXT: fmov x12, d7
+; CHECK-NEXT: mov v4.s[1], v16.s[0]
+; CHECK-NEXT: scvtf s16, x8
+; CHECK-NEXT: mov x8, v3.d[1]
+; CHECK-NEXT: mov x10, v7.d[1]
+; CHECK-NEXT: mov v6.s[1], v1.s[0]
+; CHECK-NEXT: scvtf s7, x11
+; CHECK-NEXT: scvtf s3, x9
+; CHECK-NEXT: scvtf s1, x12
+; CHECK-NEXT: mov v0.s[2], v5.s[0]
+; CHECK-NEXT: scvtf s5, x13
+; CHECK-NEXT: mov v17.s[1], v16.s[0]
+; CHECK-NEXT: mov v4.s[2], v2.s[0]
+; CHECK-NEXT: scvtf s2, x8
+; CHECK-NEXT: mov v6.s[2], v3.s[0]
+; CHECK-NEXT: mov v0.s[3], v5.s[0]
+; CHECK-NEXT: mov v17.s[2], v1.s[0]
+; CHECK-NEXT: scvtf s1, x10
+; CHECK-NEXT: mov v4.s[3], v7.s[0]
+; CHECK-NEXT: mov v6.s[3], v2.s[0]
; CHECK-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-NEXT: fcvtn2 v6.4s, v1.2d
+; CHECK-NEXT: mov v17.s[3], v1.s[0]
; CHECK-NEXT: fcvtn v1.4h, v4.4s
-; CHECK-NEXT: fcvtn2 v0.8h, v2.4s
-; CHECK-NEXT: fcvtn2 v1.8h, v6.4s
+; CHECK-NEXT: fcvtn2 v0.8h, v6.4s
+; CHECK-NEXT: fcvtn2 v1.8h, v17.4s
; CHECK-NEXT: ret
@@ -74,26 +102,125 @@ define <16 x half> @uitofp_i32(<16 x i32> %a) #0 {
define <16 x half> @uitofp_i64(<16 x i64> %a) #0 {
; CHECK-LABEL: uitofp_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-NEXT: ucvtf v4.2d, v4.2d
-; CHECK-NEXT: ucvtf v2.2d, v2.2d
-; CHECK-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-NEXT: ucvtf v6.2d, v6.2d
-; CHECK-NEXT: ucvtf v5.2d, v5.2d
-; CHECK-NEXT: ucvtf v3.2d, v3.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn v4.2s, v4.2d
-; CHECK-NEXT: fcvtn v2.2s, v2.2d
-; CHECK-NEXT: fcvtn v6.2s, v6.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT: ucvtf v1.2d, v7.2d
-; CHECK-NEXT: fcvtn2 v4.4s, v5.2d
-; CHECK-NEXT: fcvtn2 v2.4s, v3.2d
-; CHECK-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-NEXT: fcvtn2 v6.4s, v1.2d
+; CHECK-NEXT: movi v16.2d, #0x000000ffffffff
+; CHECK-NEXT: ushr v17.2d, v2.2d, #32
+; CHECK-NEXT: ushr v18.2d, v3.2d, #32
+; CHECK-NEXT: ushr v20.2d, v0.2d, #32
+; CHECK-NEXT: ushr v21.2d, v1.2d, #32
+; CHECK-NEXT: ushr v19.2d, v6.2d, #32
+; CHECK-NEXT: mov x8, v17.d[1]
+; CHECK-NEXT: fmov x10, d17
+; CHECK-NEXT: mov x9, v18.d[1]
+; CHECK-NEXT: and v22.16b, v0.16b, v16.16b
+; CHECK-NEXT: and v23.16b, v2.16b, v16.16b
+; CHECK-NEXT: fmov x12, d18
+; CHECK-NEXT: mov x13, v20.d[1]
+; CHECK-NEXT: and v24.16b, v3.16b, v16.16b
+; CHECK-NEXT: and v17.16b, v1.16b, v16.16b
+; CHECK-NEXT: scvtf s0, x10
+; CHECK-NEXT: fmov x15, d20
+; CHECK-NEXT: mov x11, v21.d[1]
+; CHECK-NEXT: mov x14, v22.d[1]
+; CHECK-NEXT: scvtf s18, x8
+; CHECK-NEXT: fmov x8, d22
+; CHECK-NEXT: mov x10, v23.d[1]
+; CHECK-NEXT: fmov x16, d23
+; CHECK-NEXT: ushr v23.2d, v4.2d, #32
+; CHECK-NEXT: scvtf s3, x15
+; CHECK-NEXT: fmov x15, d21
+; CHECK-NEXT: scvtf s21, x13
+; CHECK-NEXT: scvtf s2, x8
+; CHECK-NEXT: fmov x13, d24
+; CHECK-NEXT: and v6.16b, v6.16b, v16.16b
+; CHECK-NEXT: scvtf s20, x14
+; CHECK-NEXT: mov v0.s[1], v18.s[0]
+; CHECK-NEXT: scvtf s18, x12
+; CHECK-NEXT: fmov x12, d17
+; CHECK-NEXT: mov x14, v19.d[1]
+; CHECK-NEXT: scvtf s1, x16
+; CHECK-NEXT: scvtf s22, x10
+; CHECK-NEXT: mov x10, v17.d[1]
+; CHECK-NEXT: scvtf s17, x13
+; CHECK-NEXT: mov x13, v23.d[1]
+; CHECK-NEXT: mov v3.s[1], v21.s[0]
+; CHECK-NEXT: scvtf s21, x15
+; CHECK-NEXT: mov v2.s[1], v20.s[0]
+; CHECK-NEXT: scvtf s20, x12
+; CHECK-NEXT: fmov x12, d19
+; CHECK-NEXT: mov v0.s[2], v18.s[0]
+; CHECK-NEXT: and v18.16b, v4.16b, v16.16b
+; CHECK-NEXT: scvtf s19, x14
+; CHECK-NEXT: fmov x15, d23
+; CHECK-NEXT: mov v1.s[1], v22.s[0]
+; CHECK-NEXT: fmov x14, d6
+; CHECK-NEXT: scvtf s4, x12
+; CHECK-NEXT: mov x12, v6.d[1]
+; CHECK-NEXT: scvtf s6, x13
+; CHECK-NEXT: ushr v22.2d, v5.2d, #32
+; CHECK-NEXT: fmov x13, d18
+; CHECK-NEXT: mov v2.s[2], v20.s[0]
+; CHECK-NEXT: mov v3.s[2], v21.s[0]
+; CHECK-NEXT: scvtf s20, x11
+; CHECK-NEXT: mov x11, v18.d[1]
+; CHECK-NEXT: scvtf s21, x15
+; CHECK-NEXT: and v5.16b, v5.16b, v16.16b
+; CHECK-NEXT: mov x8, v24.d[1]
+; CHECK-NEXT: mov v4.s[1], v19.s[0]
+; CHECK-NEXT: and v19.16b, v7.16b, v16.16b
+; CHECK-NEXT: scvtf s16, x13
+; CHECK-NEXT: fmov x13, d22
+; CHECK-NEXT: ushr v7.2d, v7.2d, #32
+; CHECK-NEXT: mov v1.s[2], v17.s[0]
+; CHECK-NEXT: scvtf s18, x11
+; CHECK-NEXT: mov x11, v22.d[1]
+; CHECK-NEXT: fmov x15, d5
+; CHECK-NEXT: mov v21.s[1], v6.s[0]
+; CHECK-NEXT: scvtf s22, x12
+; CHECK-NEXT: mov v3.s[3], v20.s[0]
+; CHECK-NEXT: scvtf s6, x13
+; CHECK-NEXT: fmov x13, d7
+; CHECK-NEXT: mov x12, v7.d[1]
+; CHECK-NEXT: scvtf s7, x14
+; CHECK-NEXT: mov x14, v5.d[1]
+; CHECK-NEXT: scvtf s20, x15
+; CHECK-NEXT: mov v16.s[1], v18.s[0]
+; CHECK-NEXT: scvtf s18, x9
+; CHECK-NEXT: mov x9, v19.d[1]
+; CHECK-NEXT: scvtf s5, x13
+; CHECK-NEXT: scvtf s17, x8
+; CHECK-NEXT: mov v21.s[2], v6.s[0]
+; CHECK-NEXT: scvtf s6, x11
+; CHECK-NEXT: fmov x11, d19
+; CHECK-NEXT: scvtf s19, x10
+; CHECK-NEXT: mov w10, #1333788672 // =0x4f800000
+; CHECK-NEXT: mov v7.s[1], v22.s[0]
+; CHECK-NEXT: mov v16.s[2], v20.s[0]
+; CHECK-NEXT: dup v20.4s, w10
+; CHECK-NEXT: mov v0.s[3], v18.s[0]
+; CHECK-NEXT: scvtf s22, x11
+; CHECK-NEXT: mov v4.s[2], v5.s[0]
+; CHECK-NEXT: scvtf s5, x12
+; CHECK-NEXT: mov v21.s[3], v6.s[0]
+; CHECK-NEXT: scvtf s6, x14
+; CHECK-NEXT: mov v1.s[3], v17.s[0]
+; CHECK-NEXT: mov v2.s[3], v19.s[0]
+; CHECK-NEXT: fmul v3.4s, v3.4s, v20.4s
+; CHECK-NEXT: fmul v0.4s, v0.4s, v20.4s
+; CHECK-NEXT: mov v7.s[2], v22.s[0]
+; CHECK-NEXT: mov v4.s[3], v5.s[0]
+; CHECK-NEXT: scvtf s5, x9
+; CHECK-NEXT: mov v16.s[3], v6.s[0]
+; CHECK-NEXT: fmul v6.4s, v21.4s, v20.4s
+; CHECK-NEXT: fadd v2.4s, v3.4s, v2.4s
+; CHECK-NEXT: mov v7.s[3], v5.s[0]
+; CHECK-NEXT: fmul v3.4s, v4.4s, v20.4s
+; CHECK-NEXT: fadd v5.4s, v0.4s, v1.4s
+; CHECK-NEXT: fadd v4.4s, v6.4s, v16.4s
+; CHECK-NEXT: fcvtn v0.4h, v2.4s
+; CHECK-NEXT: fadd v2.4s, v3.4s, v7.4s
; CHECK-NEXT: fcvtn v1.4h, v4.4s
-; CHECK-NEXT: fcvtn2 v0.8h, v2.4s
-; CHECK-NEXT: fcvtn2 v1.8h, v6.4s
+; CHECK-NEXT: fcvtn2 v0.8h, v5.4s
+; CHECK-NEXT: fcvtn2 v1.8h, v2.4s
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
index d4130e7a848b1..cc9831ecd464d 100644
--- a/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
@@ -288,16 +288,30 @@ define <8 x half> @sitofp_i32(<8 x i32> %a) #0 {
define <8 x half> @sitofp_i64(<8 x i64> %a) #0 {
; CHECK-LABEL: sitofp_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: scvtf v0.2d, v0.2d
-; CHECK-NEXT: scvtf v2.2d, v2.2d
-; CHECK-NEXT: scvtf v1.2d, v1.2d
-; CHECK-NEXT: scvtf v3.2d, v3.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn v2.2s, v2.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT: fcvtn2 v2.4s, v3.2d
-; CHECK-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-NEXT: fcvtn2 v0.8h, v2.4s
+; CHECK-NEXT: mov x9, v0.d[1]
+; CHECK-NEXT: fmov x10, d0
+; CHECK-NEXT: mov x8, v2.d[1]
+; CHECK-NEXT: scvtf s4, x10
+; CHECK-NEXT: fmov x10, d1
+; CHECK-NEXT: scvtf s0, x9
+; CHECK-NEXT: fmov x9, d2
+; CHECK-NEXT: scvtf s2, x8
+; CHECK-NEXT: mov x8, v1.d[1]
+; CHECK-NEXT: scvtf s1, x9
+; CHECK-NEXT: fmov x9, d3
+; CHECK-NEXT: mov v4.s[1], v0.s[0]
+; CHECK-NEXT: scvtf s0, x10
+; CHECK-NEXT: mov x10, v3.d[1]
+; CHECK-NEXT: scvtf s3, x9
+; CHECK-NEXT: mov v1.s[1], v2.s[0]
+; CHECK-NEXT: scvtf s2, x8
+; CHECK-NEXT: mov v4.s[2], v0.s[0]
+; CHECK-NEXT: scvtf s0, x10
+; CHECK-NEXT: mov v1.s[2], v3.s[0]
+; CHECK-NEXT: mov v4.s[3], v2.s[0]
+; CHECK-NEXT: mov v1.s[3], v0.s[0]
+; CHECK-NEXT: fcvtn v0.4h, v4.4s
+; CHECK-NEXT: fcvtn2 v0.8h, v1.4s
; CHECK-NEXT: ret
%1 = sitofp <8 x i64> %a to <8 x half>
ret <8 x half> %1
@@ -409,16 +423,67 @@ define <8 x half> @uitofp_i32(<8 x i32> %a) #0 {
define <8 x half> @uitofp_i64(<8 x i64> %a) #0 {
; CHECK-LABEL: uitofp_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-NEXT: ucvtf v2.2d, v2.2d
-; CHECK-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-NEXT: ucvtf v3.2d, v3.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn v2.2s, v2.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT: fcvtn2 v2.4s, v3.2d
+; CHECK-NEXT: movi v4.2d, #0x000000ffffffff
+; CHECK-NEXT: ushr v6.2d, v2.2d, #32
+; CHECK-NEXT: ushr v5.2d, v0.2d, #32
+; CHECK-NEXT: ushr v7.2d, v1.2d, #32
+; CHECK-NEXT: mov x9, v6.d[1]
+; CHECK-NEXT: fmov x10, d6
+; CHECK-NEXT: mov x8, v5.d[1]
+; CHECK-NEXT: and v0.16b, v0.16b, v4.16b
+; CHECK-NEXT: and v2.16b, v2.16b, v4.16b
+; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
+; CHECK-NEXT: scvtf s6, x10
+; CHECK-NEXT: fmov x10, d5
+; CHECK-NEXT: mov x11, v0.d[1]
+; CHECK-NEXT: scvtf s17, x9
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: scvtf s16, x8
+; CHECK-NEXT: mov x8, v2.d[1]
+; CHECK-NEXT: scvtf s5, x10
+; CHECK-NEXT: mov x10, v7.d[1]
+; CHECK-NEXT: scvtf s0, x9
+; CHECK-NEXT: fmov x9, d2
+; CHECK-NEXT: scvtf s2, x11
+; CHECK-NEXT: fmov x11, d7
+; CHECK-NEXT: ushr v7.2d, v3.2d, #32
+; CHECK-NEXT: and v3.16b, v3.16b, v4.16b
+; CHECK-NEXT: mov v6.s[1], v17.s[0]
+; CHECK-NEXT: mov v5.s[1], v16.s[0]
+; CHECK-NEXT: scvtf s17, x9
+; CHECK-NEXT: scvtf s16, x8
+; CHECK-NEXT: scvtf s4, x11
+; CHECK-NEXT: fmov x9, d7
+; CHECK-NEXT: fmov x11, d1
+; CHECK-NEXT: mov x8, v7.d[1]
+; CHECK-NEXT: mov v0.s[1], v2.s[0]
+; CHECK-NEXT: scvtf s7, x9
+; CHECK-NEXT: scvtf s2, x11
+; CHECK-NEXT: mov x9, v1.d[1]
+; CHECK-NEXT: mov v5.s[2], v4.s[0]
+; CHECK-NEXT: scvtf s1, x10
+; CHECK-NEXT: fmov x10, d3
+; CHECK-NEXT: mov x11, v3.d[1]
+; CHECK-NEXT: mov v17.s[1], v16.s[0]
+; CHECK-NEXT: scvtf s4, x8
+; CHECK-NEXT: mov w8, #1333788672 // =0x4f800000
+; CHECK-NEXT: scvtf s3, x10
+; CHECK-NEXT: mov v6.s[2], v7.s[0]
+; CHECK-NEXT: mov v0.s[2], v2.s[0]
+; CHECK-NEXT: scvtf s2, x9
+; CHECK-NEXT: mov v5.s[3], v1.s[0]
+; CHECK-NEXT: dup v1.4s, w8
+; CHECK-NEXT: mov v17.s[2], v3.s[0]
+; CHECK-NEXT: scvtf s3, x11
+; CHECK-NEXT: mov v6.s[3], v4.s[0]
+; CHECK-NEXT: mov v0.s[3], v2.s[0]
+; CHECK-NEXT: fmul v2.4s, v5.4s, v1.4s
+; CHECK-NEXT: mov v17.s[3], v3.s[0]
+; CHECK-NEXT: fmul v1.4s, v6.4s, v1.4s
+; CHECK-NEXT: fadd v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: fadd v1.4s, v1.4s, v17.4s
; CHECK-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-NEXT: fcvtn2 v0.8h, v2.4s
+; CHECK-NEXT: fcvtn2 v0.8h, v1.4s
; CHECK-NEXT: ret
%1 = uitofp <8 x i64> %a to <8 x half>
ret <8 x half> %1
diff --git a/llvm/test/CodeGen/AArch64/fprcvt-cvtf.ll b/llvm/test/CodeGen/AArch64/fprcvt-cvtf.ll
index 0a7319b9ce11e..27499b8940ff2 100644
--- a/llvm/test/CodeGen/AArch64/fprcvt-cvtf.ll
+++ b/llvm/test/CodeGen/AArch64/fprcvt-cvtf.ll
@@ -210,15 +210,20 @@ define <1 x float> @scvtf_f32i64_simple(<1 x i64> %x) {
; CHECK-LABEL: scvtf_f32i64_simple:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: scvtf v0.2d, v0.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-NEXT: movi d1, #0000000000000000
+; CHECK-NEXT: scvtf s0, d0
+; CHECK-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-NEXT: fmov d0, d1
; CHECK-NEXT: ret
;
; CHECK-NO-FPRCVT-LABEL: scvtf_f32i64_simple:
; CHECK-NO-FPRCVT: // %bb.0:
; CHECK-NO-FPRCVT-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NO-FPRCVT-NEXT: scvtf v0.2d, v0.2d
-; CHECK-NO-FPRCVT-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-NO-FPRCVT-NEXT: fmov x8, d0
+; CHECK-NO-FPRCVT-NEXT: movi d1, #0000000000000000
+; CHECK-NO-FPRCVT-NEXT: scvtf s0, x8
+; CHECK-NO-FPRCVT-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-NO-FPRCVT-NEXT: fmov d0, d1
; CHECK-NO-FPRCVT-NEXT: ret
%conv = sitofp <1 x i64> %x to <1 x float>
ret <1 x float> %conv
@@ -426,15 +431,43 @@ define <1 x float> @ucvtf_f32i64_simple(<1 x i64> %x) {
; CHECK-LABEL: ucvtf_f32i64_simple:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
+; CHECK-NEXT: ushr v2.2d, v0.2d, #32
+; CHECK-NEXT: mov w9, #1333788672 // =0x4f800000
+; CHECK-NEXT: dup v3.2s, w9
+; CHECK-NEXT: mov x8, v2.d[1]
+; CHECK-NEXT: scvtf s2, d2
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: scvtf s1, x8
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: scvtf s0, d0
+; CHECK-NEXT: mov v2.s[1], v1.s[0]
+; CHECK-NEXT: scvtf s1, x8
+; CHECK-NEXT: fmul v2.2s, v2.2s, v3.2s
+; CHECK-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-NEXT: fadd v0.2s, v2.2s, v0.2s
; CHECK-NEXT: ret
;
; CHECK-NO-FPRCVT-LABEL: ucvtf_f32i64_simple:
; CHECK-NO-FPRCVT: // %bb.0:
; CHECK-NO-FPRCVT-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NO-FPRCVT-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-NO-FPRCVT-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-NO-FPRCVT-NEXT: movi v1.2d, #0x000000ffffffff
+; CHECK-NO-FPRCVT-NEXT: ushr v2.2d, v0.2d, #32
+; CHECK-NO-FPRCVT-NEXT: mov x8, v2.d[1]
+; CHECK-NO-FPRCVT-NEXT: fmov x9, d2
+; CHECK-NO-FPRCVT-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NO-FPRCVT-NEXT: scvtf s2, x9
+; CHECK-NO-FPRCVT-NEXT: mov w9, #1333788672 // =0x4f800000
+; CHECK-NO-FPRCVT-NEXT: scvtf s1, x8
+; CHECK-NO-FPRCVT-NEXT: mov x8, v0.d[1]
+; CHECK-NO-FPRCVT-NEXT: dup v3.2s, w9
+; CHECK-NO-FPRCVT-NEXT: fmov x9, d0
+; CHECK-NO-FPRCVT-NEXT: scvtf s0, x8
+; CHECK-NO-FPRCVT-NEXT: mov v2.s[1], v1.s[0]
+; CHECK-NO-FPRCVT-NEXT: scvtf s1, x9
+; CHECK-NO-FPRCVT-NEXT: fmul v2.2s, v2.2s, v3.2s
+; CHECK-NO-FPRCVT-NEXT: mov v1.s[1], v0.s[0]
+; CHECK-NO-FPRCVT-NEXT: fadd v0.2s, v2.2s, v1.2s
; CHECK-NO-FPRCVT-NEXT: ret
%conv = uitofp <1 x i64> %x to <1 x float>
ret <1 x float> %conv
diff --git a/llvm/test/CodeGen/AArch64/itofp-bf16.ll b/llvm/test/CodeGen/AArch64/itofp-bf16.ll
index 58591b11c184f..9754a95dbcc16 100644
--- a/llvm/test/CodeGen/AArch64/itofp-bf16.ll
+++ b/llvm/test/CodeGen/AArch64/itofp-bf16.ll
@@ -349,22 +349,27 @@ define <3 x bfloat> @stofp_v3i64_v3bf16(<3 x i64> %a) {
; CHECK-LABEL: stofp_v3i64_v3bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: movi v3.2d, #0000000000000000
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: scvtf v1.2d, v2.2d
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: scvtf v0.2d, v0.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: ushr v3.4s, v0.4s, #16
-; CHECK-NEXT: add v2.4s, v0.4s, v2.4s
-; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
-; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s
-; CHECK-NEXT: orr v0.4s, #64, lsl #16
-; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b
+; CHECK-NEXT: scvtf s0, x8
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: scvtf s1, x8
+; CHECK-NEXT: fmov x8, d2
+; CHECK-NEXT: mov v3.s[0], v0.s[0]
+; CHECK-NEXT: scvtf s0, x8
+; CHECK-NEXT: mov v3.s[1], v1.s[0]
+; CHECK-NEXT: movi v1.4s, #127, msl #8
+; CHECK-NEXT: mov v3.s[2], v0.s[0]
+; CHECK-NEXT: movi v0.4s, #1
+; CHECK-NEXT: ushr v2.4s, v3.4s, #16
+; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
+; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: fcmeq v2.4s, v3.4s, v3.4s
+; CHECK-NEXT: orr v3.4s, #64, lsl #16
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: bif v0.16b, v3.16b, v2.16b
; CHECK-NEXT: shrn v0.4h, v0.4s, #16
; CHECK-NEXT: ret
entry:
@@ -378,13 +383,40 @@ define <3 x bfloat> @utofp_v3i64_v3bf16(<3 x i64> %a) {
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: ushr v5.2d, v2.2d, #32
; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: ucvtf v1.2d, v2.2d
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
+; CHECK-NEXT: ushr v3.2d, v0.2d, #32
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-NEXT: mov x8, v3.d[1]
+; CHECK-NEXT: fmov x10, d3
+; CHECK-NEXT: mov x9, v0.d[1]
+; CHECK-NEXT: scvtf s4, x10
+; CHECK-NEXT: scvtf s3, x8
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: scvtf s0, x9
+; CHECK-NEXT: mov x9, v5.d[1]
+; CHECK-NEXT: scvtf s2, x8
+; CHECK-NEXT: fmov x8, d5
+; CHECK-NEXT: mov v4.s[1], v3.s[0]
+; CHECK-NEXT: scvtf s3, x8
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: mov v2.s[1], v0.s[0]
+; CHECK-NEXT: scvtf s0, x8
+; CHECK-NEXT: mov x8, v1.d[1]
+; CHECK-NEXT: scvtf s1, x9
+; CHECK-NEXT: mov v4.s[2], v3.s[0]
+; CHECK-NEXT: mov v2.s[2], v0.s[0]
+; CHECK-NEXT: scvtf s0, x8
+; CHECK-NEXT: mov w8, #1333788672 // =0x4f800000
+; CHECK-NEXT: mov v4.s[3], v1.s[0]
+; CHECK-NEXT: dup v1.4s, w8
+; CHECK-NEXT: mov v2.s[3], v0.s[0]
+; CHECK-NEXT: fmul v0.4s, v4.4s, v1.4s
; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: movi v2.4s, #127, msl #8
; CHECK-NEXT: ushr v3.4s, v0.4s, #16
; CHECK-NEXT: add v2.4s, v0.4s, v2.4s
; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
@@ -402,19 +434,26 @@ entry:
define <4 x bfloat> @stofp_v4i64_v4bf16(<4 x i64> %a) {
; CHECK-LABEL: stofp_v4i64_v4bf16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: scvtf v0.2d, v0.2d
-; CHECK-NEXT: scvtf v1.2d, v1.2d
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: ushr v3.4s, v0.4s, #16
-; CHECK-NEXT: add v2.4s, v0.4s, v2.4s
-; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
-; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s
-; CHECK-NEXT: orr v0.4s, #64, lsl #16
-; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: scvtf s2, x9
+; CHECK-NEXT: mov x9, v1.d[1]
+; CHECK-NEXT: scvtf s0, x8
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: scvtf s1, x8
+; CHECK-NEXT: mov v2.s[1], v0.s[0]
+; CHECK-NEXT: scvtf s0, x9
+; CHECK-NEXT: mov v2.s[2], v1.s[0]
+; CHECK-NEXT: movi v1.4s, #127, msl #8
+; CHECK-NEXT: mov v2.s[3], v0.s[0]
+; CHECK-NEXT: movi v0.4s, #1
+; CHECK-NEXT: ushr v3.4s, v2.4s, #16
+; CHECK-NEXT: add v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: and v0.16b, v3.16b, v0.16b
+; CHECK-NEXT: fcmeq v3.4s, v2.4s, v2.4s
+; CHECK-NEXT: orr v2.4s, #64, lsl #16
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: bif v0.16b, v2.16b, v3.16b
; CHECK-NEXT: shrn v0.4h, v0.4s, #16
; CHECK-NEXT: ret
entry:
@@ -425,12 +464,39 @@ entry:
define <4 x bfloat> @utofp_v4i64_v4bf16(<4 x i64> %a) {
; CHECK-LABEL: utofp_v4i64_v4bf16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-NEXT: movi v2.2d, #0x000000ffffffff
+; CHECK-NEXT: ushr v3.2d, v0.2d, #32
+; CHECK-NEXT: ushr v4.2d, v1.2d, #32
+; CHECK-NEXT: mov x8, v3.d[1]
+; CHECK-NEXT: fmov x10, d3
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: scvtf s3, x10
+; CHECK-NEXT: scvtf s5, x8
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: mov x9, v0.d[1]
+; CHECK-NEXT: scvtf s2, x8
+; CHECK-NEXT: fmov x8, d4
+; CHECK-NEXT: scvtf s0, x9
+; CHECK-NEXT: mov x9, v4.d[1]
+; CHECK-NEXT: mov v3.s[1], v5.s[0]
+; CHECK-NEXT: scvtf s4, x8
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: mov v2.s[1], v0.s[0]
+; CHECK-NEXT: scvtf s0, x8
+; CHECK-NEXT: mov x8, v1.d[1]
+; CHECK-NEXT: scvtf s1, x9
+; CHECK-NEXT: mov v3.s[2], v4.s[0]
+; CHECK-NEXT: mov v2.s[2], v0.s[0]
+; CHECK-NEXT: scvtf s0, x8
+; CHECK-NEXT: mov w8, #1333788672 // =0x4f800000
+; CHECK-NEXT: mov v3.s[3], v1.s[0]
+; CHECK-NEXT: dup v1.4s, w8
+; CHECK-NEXT: mov v2.s[3], v0.s[0]
+; CHECK-NEXT: fmul v0.4s, v3.4s, v1.4s
; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: movi v2.4s, #127, msl #8
; CHECK-NEXT: ushr v3.4s, v0.4s, #16
; CHECK-NEXT: add v2.4s, v0.4s, v2.4s
; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
@@ -448,31 +514,46 @@ entry:
define <8 x bfloat> @stofp_v8i64_v8bf16(<8 x i64> %a) {
; CHECK-LABEL: stofp_v8i64_v8bf16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: scvtf v2.2d, v2.2d
-; CHECK-NEXT: scvtf v0.2d, v0.2d
-; CHECK-NEXT: scvtf v3.2d, v3.2d
-; CHECK-NEXT: scvtf v1.2d, v1.2d
-; CHECK-NEXT: fcvtn v2.2s, v2.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn2 v2.4s, v3.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: movi v3.4s, #127, msl #8
-; CHECK-NEXT: ushr v4.4s, v2.4s, #16
-; CHECK-NEXT: ushr v5.4s, v0.4s, #16
-; CHECK-NEXT: add v6.4s, v2.4s, v3.4s
-; CHECK-NEXT: add v3.4s, v0.4s, v3.4s
-; CHECK-NEXT: and v4.16b, v4.16b, v1.16b
-; CHECK-NEXT: and v1.16b, v5.16b, v1.16b
+; CHECK-NEXT: fmov x10, d2
+; CHECK-NEXT: mov x8, v2.d[1]
+; CHECK-NEXT: mov x9, v0.d[1]
+; CHECK-NEXT: scvtf s2, x10
+; CHECK-NEXT: fmov x10, d0
+; CHECK-NEXT: scvtf s0, x8
+; CHECK-NEXT: scvtf s5, x9
+; CHECK-NEXT: fmov x9, d3
+; CHECK-NEXT: mov x8, v3.d[1]
+; CHECK-NEXT: scvtf s4, x10
+; CHECK-NEXT: fmov x10, d1
+; CHECK-NEXT: scvtf s3, x9
+; CHECK-NEXT: mov x9, v1.d[1]
+; CHECK-NEXT: mov v2.s[1], v0.s[0]
+; CHECK-NEXT: scvtf s0, x10
+; CHECK-NEXT: scvtf s1, x8
+; CHECK-NEXT: mov v4.s[1], v5.s[0]
+; CHECK-NEXT: mov v2.s[2], v3.s[0]
+; CHECK-NEXT: scvtf s3, x9
+; CHECK-NEXT: mov v4.s[2], v0.s[0]
+; CHECK-NEXT: movi v0.4s, #1
+; CHECK-NEXT: mov v2.s[3], v1.s[0]
+; CHECK-NEXT: movi v1.4s, #127, msl #8
+; CHECK-NEXT: mov v4.s[3], v3.s[0]
+; CHECK-NEXT: ushr v3.4s, v2.4s, #16
+; CHECK-NEXT: add v6.4s, v2.4s, v1.4s
+; CHECK-NEXT: ushr v5.4s, v4.4s, #16
+; CHECK-NEXT: add v1.4s, v4.4s, v1.4s
+; CHECK-NEXT: and v3.16b, v3.16b, v0.16b
+; CHECK-NEXT: and v0.16b, v5.16b, v0.16b
; CHECK-NEXT: fcmeq v5.4s, v2.4s, v2.4s
; CHECK-NEXT: orr v2.4s, #64, lsl #16
-; CHECK-NEXT: add v4.4s, v4.4s, v6.4s
-; CHECK-NEXT: fcmeq v6.4s, v0.4s, v0.4s
-; CHECK-NEXT: add v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: orr v0.4s, #64, lsl #16
-; CHECK-NEXT: bit v2.16b, v4.16b, v5.16b
-; CHECK-NEXT: bit v0.16b, v1.16b, v6.16b
-; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: add v3.4s, v3.4s, v6.4s
+; CHECK-NEXT: fcmeq v6.4s, v4.4s, v4.4s
+; CHECK-NEXT: orr v4.4s, #64, lsl #16
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: mov v1.16b, v5.16b
+; CHECK-NEXT: bif v0.16b, v4.16b, v6.16b
+; CHECK-NEXT: bsl v1.16b, v3.16b, v2.16b
+; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
entry:
%c = sitofp <8 x i64> %a to <8 x bfloat>
@@ -482,31 +563,82 @@ entry:
define <8 x bfloat> @utofp_v8i64_v8bf16(<8 x i64> %a) {
; CHECK-LABEL: utofp_v8i64_v8bf16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ucvtf v2.2d, v2.2d
-; CHECK-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-NEXT: ucvtf v3.2d, v3.2d
-; CHECK-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-NEXT: fcvtn v2.2s, v2.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn2 v2.4s, v3.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: ushr v5.2d, v2.2d, #32
+; CHECK-NEXT: movi v4.2d, #0x000000ffffffff
+; CHECK-NEXT: ushr v6.2d, v0.2d, #32
+; CHECK-NEXT: ushr v7.2d, v3.2d, #32
+; CHECK-NEXT: ushr v16.2d, v1.2d, #32
+; CHECK-NEXT: fmov x10, d5
+; CHECK-NEXT: mov x8, v5.d[1]
+; CHECK-NEXT: mov x9, v6.d[1]
+; CHECK-NEXT: and v2.16b, v2.16b, v4.16b
+; CHECK-NEXT: and v3.16b, v3.16b, v4.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v4.16b
+; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
+; CHECK-NEXT: fmov x12, d7
+; CHECK-NEXT: mov x11, v7.d[1]
+; CHECK-NEXT: scvtf s4, x10
+; CHECK-NEXT: fmov x10, d6
+; CHECK-NEXT: mov x13, v2.d[1]
+; CHECK-NEXT: scvtf s5, x8
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: scvtf s7, x9
+; CHECK-NEXT: scvtf s17, x12
+; CHECK-NEXT: fmov x12, d16
+; CHECK-NEXT: scvtf s6, x10
+; CHECK-NEXT: fmov x10, d2
+; CHECK-NEXT: mov x9, v16.d[1]
+; CHECK-NEXT: scvtf s16, x13
+; CHECK-NEXT: mov v4.s[1], v5.s[0]
+; CHECK-NEXT: fmov x13, d1
+; CHECK-NEXT: scvtf s2, x10
+; CHECK-NEXT: fmov x10, d0
+; CHECK-NEXT: scvtf s0, x12
+; CHECK-NEXT: mov v6.s[1], v7.s[0]
+; CHECK-NEXT: scvtf s7, x8
+; CHECK-NEXT: mov x8, v3.d[1]
+; CHECK-NEXT: mov x12, v1.d[1]
+; CHECK-NEXT: scvtf s5, x10
+; CHECK-NEXT: fmov x10, d3
+; CHECK-NEXT: scvtf s3, x11
+; CHECK-NEXT: mov v2.s[1], v16.s[0]
+; CHECK-NEXT: mov v4.s[2], v17.s[0]
+; CHECK-NEXT: scvtf s16, x13
+; CHECK-NEXT: mov v6.s[2], v0.s[0]
+; CHECK-NEXT: scvtf s0, x9
+; CHECK-NEXT: mov w9, #1333788672 // =0x4f800000
+; CHECK-NEXT: scvtf s1, x10
+; CHECK-NEXT: mov v5.s[1], v7.s[0]
+; CHECK-NEXT: dup v7.4s, w9
+; CHECK-NEXT: mov v4.s[3], v3.s[0]
+; CHECK-NEXT: scvtf s3, x12
+; CHECK-NEXT: mov v6.s[3], v0.s[0]
+; CHECK-NEXT: mov v2.s[2], v1.s[0]
+; CHECK-NEXT: scvtf s1, x8
+; CHECK-NEXT: mov v5.s[2], v16.s[0]
+; CHECK-NEXT: fmul v0.4s, v4.4s, v7.4s
+; CHECK-NEXT: mov v2.s[3], v1.s[0]
+; CHECK-NEXT: fmul v1.4s, v6.4s, v7.4s
+; CHECK-NEXT: mov v5.s[3], v3.s[0]
; CHECK-NEXT: movi v3.4s, #127, msl #8
-; CHECK-NEXT: ushr v4.4s, v2.4s, #16
-; CHECK-NEXT: ushr v5.4s, v0.4s, #16
-; CHECK-NEXT: add v6.4s, v2.4s, v3.4s
-; CHECK-NEXT: add v3.4s, v0.4s, v3.4s
-; CHECK-NEXT: and v4.16b, v4.16b, v1.16b
-; CHECK-NEXT: and v1.16b, v5.16b, v1.16b
-; CHECK-NEXT: fcmeq v5.4s, v2.4s, v2.4s
-; CHECK-NEXT: orr v2.4s, #64, lsl #16
-; CHECK-NEXT: add v4.4s, v4.4s, v6.4s
-; CHECK-NEXT: fcmeq v6.4s, v0.4s, v0.4s
-; CHECK-NEXT: add v1.4s, v1.4s, v3.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: movi v2.4s, #1
+; CHECK-NEXT: fadd v1.4s, v1.4s, v5.4s
+; CHECK-NEXT: ushr v4.4s, v0.4s, #16
+; CHECK-NEXT: add v6.4s, v0.4s, v3.4s
+; CHECK-NEXT: ushr v5.4s, v1.4s, #16
+; CHECK-NEXT: add v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: fcmeq v7.4s, v1.4s, v1.4s
+; CHECK-NEXT: orr v1.4s, #64, lsl #16
+; CHECK-NEXT: and v4.16b, v4.16b, v2.16b
+; CHECK-NEXT: and v2.16b, v5.16b, v2.16b
+; CHECK-NEXT: fcmeq v5.4s, v0.4s, v0.4s
; CHECK-NEXT: orr v0.4s, #64, lsl #16
-; CHECK-NEXT: bit v2.16b, v4.16b, v5.16b
-; CHECK-NEXT: bit v0.16b, v1.16b, v6.16b
-; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: add v4.4s, v4.4s, v6.4s
+; CHECK-NEXT: add v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: bit v0.16b, v4.16b, v5.16b
+; CHECK-NEXT: bit v1.16b, v2.16b, v7.16b
+; CHECK-NEXT: uzp2 v0.8h, v1.8h, v0.8h
; CHECK-NEXT: ret
entry:
%c = uitofp <8 x i64> %a to <8 x bfloat>
@@ -516,55 +648,82 @@ entry:
define <16 x bfloat> @stofp_v16i64_v16bf16(<16 x i64> %a) {
; CHECK-LABEL: stofp_v16i64_v16bf16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: scvtf v0.2d, v0.2d
-; CHECK-NEXT: scvtf v2.2d, v2.2d
-; CHECK-NEXT: scvtf v6.2d, v6.2d
-; CHECK-NEXT: scvtf v4.2d, v4.2d
-; CHECK-NEXT: scvtf v1.2d, v1.2d
-; CHECK-NEXT: scvtf v3.2d, v3.2d
-; CHECK-NEXT: scvtf v7.2d, v7.2d
-; CHECK-NEXT: scvtf v5.2d, v5.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn v2.2s, v2.2d
-; CHECK-NEXT: fcvtn v6.2s, v6.2d
-; CHECK-NEXT: fcvtn v4.2s, v4.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT: fcvtn2 v2.4s, v3.2d
-; CHECK-NEXT: fcvtn2 v6.4s, v7.2d
-; CHECK-NEXT: fcvtn2 v4.4s, v5.2d
+; CHECK-NEXT: mov x8, v2.d[1]
+; CHECK-NEXT: fmov x11, d0
+; CHECK-NEXT: mov x9, v0.d[1]
+; CHECK-NEXT: fmov x10, d2
+; CHECK-NEXT: mov x12, v6.d[1]
+; CHECK-NEXT: scvtf s2, x11
+; CHECK-NEXT: fmov x11, d3
+; CHECK-NEXT: scvtf s16, x8
+; CHECK-NEXT: fmov x8, d6
+; CHECK-NEXT: scvtf s0, x10
+; CHECK-NEXT: mov x10, v4.d[1]
+; CHECK-NEXT: scvtf s17, x9
+; CHECK-NEXT: mov x9, v3.d[1]
+; CHECK-NEXT: scvtf s6, x12
+; CHECK-NEXT: fmov x12, d4
+; CHECK-NEXT: scvtf s4, x11
+; CHECK-NEXT: scvtf s3, x8
+; CHECK-NEXT: fmov x11, d7
+; CHECK-NEXT: mov x8, v1.d[1]
+; CHECK-NEXT: mov v0.s[1], v16.s[0]
+; CHECK-NEXT: scvtf s18, x10
+; CHECK-NEXT: scvtf s19, x12
+; CHECK-NEXT: fmov x10, d1
+; CHECK-NEXT: mov v2.s[1], v17.s[0]
+; CHECK-NEXT: mov x12, v5.d[1]
+; CHECK-NEXT: mov v3.s[1], v6.s[0]
+; CHECK-NEXT: scvtf s6, x11
+; CHECK-NEXT: fmov x11, d5
+; CHECK-NEXT: scvtf s1, x10
+; CHECK-NEXT: mov x10, v7.d[1]
+; CHECK-NEXT: scvtf s7, x9
+; CHECK-NEXT: mov v19.s[1], v18.s[0]
+; CHECK-NEXT: scvtf s16, x8
+; CHECK-NEXT: mov v0.s[2], v4.s[0]
+; CHECK-NEXT: scvtf s5, x11
+; CHECK-NEXT: mov v3.s[2], v6.s[0]
+; CHECK-NEXT: scvtf s4, x10
+; CHECK-NEXT: mov v2.s[2], v1.s[0]
+; CHECK-NEXT: scvtf s1, x12
+; CHECK-NEXT: mov v0.s[3], v7.s[0]
+; CHECK-NEXT: mov v19.s[2], v5.s[0]
+; CHECK-NEXT: mov v2.s[3], v16.s[0]
+; CHECK-NEXT: mov v3.s[3], v4.s[0]
+; CHECK-NEXT: movi v4.4s, #127, msl #8
+; CHECK-NEXT: ushr v5.4s, v0.4s, #16
+; CHECK-NEXT: mov v19.s[3], v1.s[0]
; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: movi v3.4s, #127, msl #8
-; CHECK-NEXT: ushr v7.4s, v0.4s, #16
-; CHECK-NEXT: ushr v5.4s, v2.4s, #16
-; CHECK-NEXT: ushr v16.4s, v6.4s, #16
-; CHECK-NEXT: ushr v17.4s, v4.4s, #16
-; CHECK-NEXT: add v19.4s, v0.4s, v3.4s
-; CHECK-NEXT: add v18.4s, v2.4s, v3.4s
-; CHECK-NEXT: add v20.4s, v6.4s, v3.4s
-; CHECK-NEXT: add v3.4s, v4.4s, v3.4s
-; CHECK-NEXT: and v7.16b, v7.16b, v1.16b
+; CHECK-NEXT: ushr v6.4s, v2.4s, #16
+; CHECK-NEXT: ushr v7.4s, v3.4s, #16
+; CHECK-NEXT: add v17.4s, v0.4s, v4.4s
+; CHECK-NEXT: add v18.4s, v2.4s, v4.4s
+; CHECK-NEXT: add v20.4s, v3.4s, v4.4s
+; CHECK-NEXT: ushr v16.4s, v19.4s, #16
; CHECK-NEXT: and v5.16b, v5.16b, v1.16b
-; CHECK-NEXT: and v16.16b, v16.16b, v1.16b
-; CHECK-NEXT: and v1.16b, v17.16b, v1.16b
+; CHECK-NEXT: add v4.4s, v19.4s, v4.4s
+; CHECK-NEXT: and v6.16b, v6.16b, v1.16b
+; CHECK-NEXT: and v7.16b, v7.16b, v1.16b
+; CHECK-NEXT: and v1.16b, v16.16b, v1.16b
+; CHECK-NEXT: add v5.4s, v5.4s, v17.4s
+; CHECK-NEXT: fcmeq v16.4s, v0.4s, v0.4s
+; CHECK-NEXT: add v6.4s, v6.4s, v18.4s
; CHECK-NEXT: fcmeq v17.4s, v2.4s, v2.4s
-; CHECK-NEXT: orr v2.4s, #64, lsl #16
-; CHECK-NEXT: add v7.4s, v7.4s, v19.4s
-; CHECK-NEXT: fcmeq v19.4s, v6.4s, v6.4s
-; CHECK-NEXT: add v5.4s, v5.4s, v18.4s
-; CHECK-NEXT: fcmeq v18.4s, v0.4s, v0.4s
-; CHECK-NEXT: add v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: fcmeq v3.4s, v4.4s, v4.4s
-; CHECK-NEXT: add v16.4s, v16.4s, v20.4s
+; CHECK-NEXT: fcmeq v18.4s, v3.4s, v3.4s
; CHECK-NEXT: orr v0.4s, #64, lsl #16
-; CHECK-NEXT: orr v6.4s, #64, lsl #16
-; CHECK-NEXT: orr v4.4s, #64, lsl #16
-; CHECK-NEXT: bit v2.16b, v5.16b, v17.16b
-; CHECK-NEXT: mov v5.16b, v19.16b
-; CHECK-NEXT: bit v0.16b, v7.16b, v18.16b
-; CHECK-NEXT: bif v1.16b, v4.16b, v3.16b
-; CHECK-NEXT: bsl v5.16b, v16.16b, v6.16b
-; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
-; CHECK-NEXT: uzp2 v1.8h, v1.8h, v5.8h
+; CHECK-NEXT: add v7.4s, v7.4s, v20.4s
+; CHECK-NEXT: orr v2.4s, #64, lsl #16
+; CHECK-NEXT: add v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: fcmeq v4.4s, v19.4s, v19.4s
+; CHECK-NEXT: orr v3.4s, #64, lsl #16
+; CHECK-NEXT: orr v19.4s, #64, lsl #16
+; CHECK-NEXT: bit v0.16b, v5.16b, v16.16b
+; CHECK-NEXT: bit v2.16b, v6.16b, v17.16b
+; CHECK-NEXT: bit v3.16b, v7.16b, v18.16b
+; CHECK-NEXT: bif v1.16b, v19.16b, v4.16b
+; CHECK-NEXT: uzp2 v0.8h, v2.8h, v0.8h
+; CHECK-NEXT: uzp2 v1.8h, v1.8h, v3.8h
; CHECK-NEXT: ret
entry:
%c = sitofp <16 x i64> %a to <16 x bfloat>
@@ -574,55 +733,154 @@ entry:
define <16 x bfloat> @utofp_v16i64_v16bf16(<16 x i64> %a) {
; CHECK-LABEL: utofp_v16i64_v16bf16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-NEXT: ucvtf v2.2d, v2.2d
-; CHECK-NEXT: ucvtf v6.2d, v6.2d
-; CHECK-NEXT: ucvtf v4.2d, v4.2d
-; CHECK-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-NEXT: ucvtf v3.2d, v3.2d
-; CHECK-NEXT: ucvtf v7.2d, v7.2d
-; CHECK-NEXT: ucvtf v5.2d, v5.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn v2.2s, v2.2d
-; CHECK-NEXT: fcvtn v6.2s, v6.2d
-; CHECK-NEXT: fcvtn v4.2s, v4.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT: fcvtn2 v2.4s, v3.2d
-; CHECK-NEXT: fcvtn2 v6.4s, v7.2d
-; CHECK-NEXT: fcvtn2 v4.4s, v5.2d
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: movi v3.4s, #127, msl #8
-; CHECK-NEXT: ushr v7.4s, v0.4s, #16
-; CHECK-NEXT: ushr v5.4s, v2.4s, #16
-; CHECK-NEXT: ushr v16.4s, v6.4s, #16
-; CHECK-NEXT: ushr v17.4s, v4.4s, #16
-; CHECK-NEXT: add v19.4s, v0.4s, v3.4s
-; CHECK-NEXT: add v18.4s, v2.4s, v3.4s
-; CHECK-NEXT: add v20.4s, v6.4s, v3.4s
-; CHECK-NEXT: add v3.4s, v4.4s, v3.4s
-; CHECK-NEXT: and v7.16b, v7.16b, v1.16b
-; CHECK-NEXT: and v5.16b, v5.16b, v1.16b
-; CHECK-NEXT: and v16.16b, v16.16b, v1.16b
-; CHECK-NEXT: and v1.16b, v17.16b, v1.16b
-; CHECK-NEXT: fcmeq v17.4s, v2.4s, v2.4s
-; CHECK-NEXT: orr v2.4s, #64, lsl #16
-; CHECK-NEXT: add v7.4s, v7.4s, v19.4s
-; CHECK-NEXT: fcmeq v19.4s, v6.4s, v6.4s
-; CHECK-NEXT: add v5.4s, v5.4s, v18.4s
-; CHECK-NEXT: fcmeq v18.4s, v0.4s, v0.4s
-; CHECK-NEXT: add v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: fcmeq v3.4s, v4.4s, v4.4s
-; CHECK-NEXT: add v16.4s, v16.4s, v20.4s
+; CHECK-NEXT: movi v16.2d, #0x000000ffffffff
+; CHECK-NEXT: ushr v18.2d, v2.2d, #32
+; CHECK-NEXT: ushr v22.2d, v0.2d, #32
+; CHECK-NEXT: ushr v19.2d, v3.2d, #32
+; CHECK-NEXT: ushr v21.2d, v1.2d, #32
+; CHECK-NEXT: ushr v20.2d, v6.2d, #32
+; CHECK-NEXT: mov x8, v18.d[1]
+; CHECK-NEXT: fmov x9, d18
+; CHECK-NEXT: mov x11, v22.d[1]
+; CHECK-NEXT: and v17.16b, v2.16b, v16.16b
+; CHECK-NEXT: and v23.16b, v0.16b, v16.16b
+; CHECK-NEXT: and v25.16b, v3.16b, v16.16b
+; CHECK-NEXT: and v18.16b, v1.16b, v16.16b
+; CHECK-NEXT: and v6.16b, v6.16b, v16.16b
+; CHECK-NEXT: scvtf s2, x9
+; CHECK-NEXT: fmov x9, d19
+; CHECK-NEXT: mov x10, v17.d[1]
+; CHECK-NEXT: fmov x12, d17
+; CHECK-NEXT: mov x13, v23.d[1]
+; CHECK-NEXT: scvtf s24, x8
+; CHECK-NEXT: mov x8, v19.d[1]
+; CHECK-NEXT: fmov x14, d18
+; CHECK-NEXT: scvtf s19, x9
+; CHECK-NEXT: mov x9, v25.d[1]
+; CHECK-NEXT: scvtf s0, x12
+; CHECK-NEXT: fmov x12, d23
+; CHECK-NEXT: scvtf s17, x10
+; CHECK-NEXT: fmov x10, d22
+; CHECK-NEXT: scvtf s22, x11
+; CHECK-NEXT: scvtf s23, x13
+; CHECK-NEXT: mov v2.s[1], v24.s[0]
+; CHECK-NEXT: mov x11, v6.d[1]
+; CHECK-NEXT: scvtf s1, x12
+; CHECK-NEXT: mov x12, v20.d[1]
+; CHECK-NEXT: mov x13, v18.d[1]
+; CHECK-NEXT: scvtf s3, x10
+; CHECK-NEXT: fmov x10, d25
+; CHECK-NEXT: mov v0.s[1], v17.s[0]
+; CHECK-NEXT: mov v2.s[2], v19.s[0]
+; CHECK-NEXT: scvtf s19, x8
+; CHECK-NEXT: mov x8, v21.d[1]
+; CHECK-NEXT: scvtf s17, x10
+; CHECK-NEXT: fmov x10, d21
+; CHECK-NEXT: mov v1.s[1], v23.s[0]
+; CHECK-NEXT: mov v3.s[1], v22.s[0]
+; CHECK-NEXT: ushr v22.2d, v4.2d, #32
+; CHECK-NEXT: scvtf s23, x9
+; CHECK-NEXT: fmov x9, d6
+; CHECK-NEXT: and v6.16b, v4.16b, v16.16b
+; CHECK-NEXT: ushr v21.2d, v7.2d, #32
+; CHECK-NEXT: mov v2.s[3], v19.s[0]
+; CHECK-NEXT: scvtf s19, x11
+; CHECK-NEXT: and v7.16b, v7.16b, v16.16b
+; CHECK-NEXT: mov v0.s[2], v17.s[0]
+; CHECK-NEXT: scvtf s17, x10
+; CHECK-NEXT: fmov x10, d20
+; CHECK-NEXT: scvtf s4, x9
+; CHECK-NEXT: mov x9, v22.d[1]
+; CHECK-NEXT: scvtf s20, x12
+; CHECK-NEXT: mov x11, v21.d[1]
+; CHECK-NEXT: fmov x12, d21
+; CHECK-NEXT: scvtf s21, x10
+; CHECK-NEXT: mov x10, v6.d[1]
+; CHECK-NEXT: mov v3.s[2], v17.s[0]
+; CHECK-NEXT: mov v0.s[3], v23.s[0]
+; CHECK-NEXT: scvtf s17, x9
+; CHECK-NEXT: fmov x9, d22
+; CHECK-NEXT: mov v4.s[1], v19.s[0]
+; CHECK-NEXT: scvtf s18, x12
+; CHECK-NEXT: fmov x12, d7
+; CHECK-NEXT: scvtf s22, x10
+; CHECK-NEXT: mov w10, #1333788672 // =0x4f800000
+; CHECK-NEXT: mov v21.s[1], v20.s[0]
+; CHECK-NEXT: scvtf s19, x9
+; CHECK-NEXT: fmov x9, d6
+; CHECK-NEXT: ushr v6.2d, v5.2d, #32
+; CHECK-NEXT: and v5.16b, v5.16b, v16.16b
+; CHECK-NEXT: dup v16.4s, w10
+; CHECK-NEXT: scvtf s20, x9
+; CHECK-NEXT: fmov x10, d6
+; CHECK-NEXT: mov x9, v6.d[1]
+; CHECK-NEXT: mov v19.s[1], v17.s[0]
+; CHECK-NEXT: mov v21.s[2], v18.s[0]
+; CHECK-NEXT: scvtf s18, x12
+; CHECK-NEXT: scvtf s6, x14
+; CHECK-NEXT: fmul v2.4s, v2.4s, v16.4s
+; CHECK-NEXT: scvtf s17, x10
+; CHECK-NEXT: mov x10, v7.d[1]
+; CHECK-NEXT: scvtf s7, x8
+; CHECK-NEXT: mov v20.s[1], v22.s[0]
+; CHECK-NEXT: scvtf s22, x11
+; CHECK-NEXT: fmov x11, d5
+; CHECK-NEXT: mov x8, v5.d[1]
+; CHECK-NEXT: mov v4.s[2], v18.s[0]
+; CHECK-NEXT: mov v1.s[2], v6.s[0]
+; CHECK-NEXT: scvtf s6, x13
+; CHECK-NEXT: fadd v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: scvtf s5, x11
+; CHECK-NEXT: mov v19.s[2], v17.s[0]
+; CHECK-NEXT: scvtf s17, x9
+; CHECK-NEXT: mov v3.s[3], v7.s[0]
+; CHECK-NEXT: scvtf s7, x10
+; CHECK-NEXT: mov v21.s[3], v22.s[0]
+; CHECK-NEXT: mov v1.s[3], v6.s[0]
+; CHECK-NEXT: movi v6.4s, #1
+; CHECK-NEXT: mov v20.s[2], v5.s[0]
+; CHECK-NEXT: scvtf s5, x8
+; CHECK-NEXT: mov v19.s[3], v17.s[0]
+; CHECK-NEXT: mov v4.s[3], v7.s[0]
+; CHECK-NEXT: fmul v2.4s, v21.4s, v16.4s
+; CHECK-NEXT: fmul v3.4s, v3.4s, v16.4s
+; CHECK-NEXT: movi v7.4s, #127, msl #8
+; CHECK-NEXT: mov v20.s[3], v5.s[0]
+; CHECK-NEXT: fmul v5.4s, v19.4s, v16.4s
+; CHECK-NEXT: fcmeq v19.4s, v0.4s, v0.4s
+; CHECK-NEXT: fadd v2.4s, v2.4s, v4.4s
+; CHECK-NEXT: fadd v1.4s, v3.4s, v1.4s
+; CHECK-NEXT: ushr v3.4s, v0.4s, #16
+; CHECK-NEXT: fadd v4.4s, v5.4s, v20.4s
+; CHECK-NEXT: and v3.16b, v3.16b, v6.16b
+; CHECK-NEXT: add v5.4s, v0.4s, v7.4s
+; CHECK-NEXT: ushr v17.4s, v2.4s, #16
+; CHECK-NEXT: ushr v16.4s, v1.4s, #16
+; CHECK-NEXT: add v20.4s, v2.4s, v7.4s
; CHECK-NEXT: orr v0.4s, #64, lsl #16
-; CHECK-NEXT: orr v6.4s, #64, lsl #16
+; CHECK-NEXT: add v3.4s, v3.4s, v5.4s
+; CHECK-NEXT: and v17.16b, v17.16b, v6.16b
+; CHECK-NEXT: ushr v18.4s, v4.4s, #16
+; CHECK-NEXT: and v5.16b, v16.16b, v6.16b
+; CHECK-NEXT: add v16.4s, v1.4s, v7.4s
+; CHECK-NEXT: add v7.4s, v4.4s, v7.4s
+; CHECK-NEXT: bit v0.16b, v3.16b, v19.16b
+; CHECK-NEXT: add v17.4s, v17.4s, v20.4s
+; CHECK-NEXT: fcmeq v20.4s, v4.4s, v4.4s
+; CHECK-NEXT: and v6.16b, v18.16b, v6.16b
+; CHECK-NEXT: add v5.4s, v5.4s, v16.4s
+; CHECK-NEXT: fcmeq v16.4s, v1.4s, v1.4s
+; CHECK-NEXT: fcmeq v18.4s, v2.4s, v2.4s
+; CHECK-NEXT: orr v1.4s, #64, lsl #16
+; CHECK-NEXT: orr v2.4s, #64, lsl #16
; CHECK-NEXT: orr v4.4s, #64, lsl #16
-; CHECK-NEXT: bit v2.16b, v5.16b, v17.16b
-; CHECK-NEXT: mov v5.16b, v19.16b
-; CHECK-NEXT: bit v0.16b, v7.16b, v18.16b
-; CHECK-NEXT: bif v1.16b, v4.16b, v3.16b
-; CHECK-NEXT: bsl v5.16b, v16.16b, v6.16b
-; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
-; CHECK-NEXT: uzp2 v1.8h, v1.8h, v5.8h
+; CHECK-NEXT: add v6.4s, v6.4s, v7.4s
+; CHECK-NEXT: mov v3.16b, v20.16b
+; CHECK-NEXT: bit v1.16b, v5.16b, v16.16b
+; CHECK-NEXT: bit v2.16b, v17.16b, v18.16b
+; CHECK-NEXT: bsl v3.16b, v6.16b, v4.16b
+; CHECK-NEXT: uzp2 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: uzp2 v1.8h, v3.8h, v2.8h
; CHECK-NEXT: ret
entry:
%c = uitofp <16 x i64> %a to <16 x bfloat>
@@ -632,107 +890,162 @@ entry:
define <32 x bfloat> @stofp_v32i64_v32bf16(<32 x i64> %a) {
; CHECK-LABEL: stofp_v32i64_v32bf16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: scvtf v17.2d, v2.2d
-; CHECK-NEXT: scvtf v18.2d, v0.2d
-; CHECK-NEXT: scvtf v19.2d, v3.2d
-; CHECK-NEXT: scvtf v3.2d, v6.2d
-; CHECK-NEXT: ldp q21, q20, [sp, #32]
-; CHECK-NEXT: scvtf v4.2d, v4.2d
-; CHECK-NEXT: scvtf v6.2d, v7.2d
-; CHECK-NEXT: scvtf v5.2d, v5.2d
-; CHECK-NEXT: ldp q24, q23, [sp, #64]
-; CHECK-NEXT: movi v16.4s, #1
-; CHECK-NEXT: fcvtn v0.2s, v17.2d
-; CHECK-NEXT: scvtf v17.2d, v1.2d
-; CHECK-NEXT: fcvtn v1.2s, v18.2d
-; CHECK-NEXT: fcvtn v3.2s, v3.2d
-; CHECK-NEXT: ldp q18, q7, [sp]
-; CHECK-NEXT: scvtf v21.2d, v21.2d
-; CHECK-NEXT: fcvtn v4.2s, v4.2d
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: scvtf v20.2d, v20.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v19.2d
-; CHECK-NEXT: ldp q22, q19, [sp, #96]
-; CHECK-NEXT: fcvtn2 v1.4s, v17.2d
-; CHECK-NEXT: fcvtn2 v3.4s, v6.2d
-; CHECK-NEXT: scvtf v18.2d, v18.2d
-; CHECK-NEXT: scvtf v17.2d, v24.2d
-; CHECK-NEXT: fcvtn v6.2s, v21.2d
-; CHECK-NEXT: fcvtn2 v4.4s, v5.2d
-; CHECK-NEXT: scvtf v22.2d, v22.2d
-; CHECK-NEXT: scvtf v21.2d, v23.2d
-; CHECK-NEXT: scvtf v7.2d, v7.2d
-; CHECK-NEXT: ushr v24.4s, v0.4s, #16
-; CHECK-NEXT: add v5.4s, v0.4s, v2.4s
-; CHECK-NEXT: scvtf v19.2d, v19.2d
-; CHECK-NEXT: ushr v23.4s, v1.4s, #16
-; CHECK-NEXT: ushr v25.4s, v3.4s, #16
-; CHECK-NEXT: fcvtn v18.2s, v18.2d
-; CHECK-NEXT: fcvtn2 v6.4s, v20.2d
-; CHECK-NEXT: add v26.4s, v1.4s, v2.4s
-; CHECK-NEXT: fcvtn v17.2s, v17.2d
-; CHECK-NEXT: and v24.16b, v24.16b, v16.16b
-; CHECK-NEXT: fcvtn v22.2s, v22.2d
-; CHECK-NEXT: fcmeq v20.4s, v0.4s, v0.4s
-; CHECK-NEXT: and v23.16b, v23.16b, v16.16b
-; CHECK-NEXT: orr v0.4s, #64, lsl #16
-; CHECK-NEXT: fcmeq v27.4s, v3.4s, v3.4s
-; CHECK-NEXT: fcvtn2 v18.4s, v7.2d
-; CHECK-NEXT: add v7.4s, v3.4s, v2.4s
-; CHECK-NEXT: orr v3.4s, #64, lsl #16
-; CHECK-NEXT: add v5.4s, v24.4s, v5.4s
-; CHECK-NEXT: and v24.16b, v25.16b, v16.16b
-; CHECK-NEXT: ushr v25.4s, v4.4s, #16
-; CHECK-NEXT: fcvtn2 v22.4s, v19.2d
-; CHECK-NEXT: add v19.4s, v23.4s, v26.4s
-; CHECK-NEXT: ushr v26.4s, v6.4s, #16
-; CHECK-NEXT: fcvtn2 v17.4s, v21.2d
-; CHECK-NEXT: fcmeq v21.4s, v1.4s, v1.4s
+; CHECK-NEXT: fmov x10, d2
+; CHECK-NEXT: mov x9, v3.d[1]
+; CHECK-NEXT: mov x8, v2.d[1]
+; CHECK-NEXT: fmov x11, d3
+; CHECK-NEXT: fmov x12, d0
+; CHECK-NEXT: movi v3.4s, #1
+; CHECK-NEXT: scvtf s2, x10
+; CHECK-NEXT: mov x10, v0.d[1]
+; CHECK-NEXT: scvtf s19, x9
+; CHECK-NEXT: fmov x9, d1
+; CHECK-NEXT: scvtf s16, x11
+; CHECK-NEXT: mov x11, v6.d[1]
+; CHECK-NEXT: scvtf s0, x12
+; CHECK-NEXT: scvtf s18, x8
+; CHECK-NEXT: mov x8, v1.d[1]
+; CHECK-NEXT: scvtf s20, x10
+; CHECK-NEXT: scvtf s17, x9
+; CHECK-NEXT: mov x9, v7.d[1]
+; CHECK-NEXT: mov x10, v4.d[1]
+; CHECK-NEXT: scvtf s21, x11
+; CHECK-NEXT: fmov x11, d6
+; CHECK-NEXT: mov v2.s[1], v18.s[0]
+; CHECK-NEXT: scvtf s25, x8
+; CHECK-NEXT: movi v6.4s, #127, msl #8
+; CHECK-NEXT: mov v0.s[1], v20.s[0]
+; CHECK-NEXT: ldp q24, q20, [sp, #32]
+; CHECK-NEXT: scvtf s22, x9
+; CHECK-NEXT: fmov x9, d4
+; CHECK-NEXT: scvtf s1, x11
+; CHECK-NEXT: scvtf s26, x10
+; CHECK-NEXT: fmov x11, d7
+; CHECK-NEXT: mov v2.s[2], v16.s[0]
+; CHECK-NEXT: ldp q18, q16, [sp]
+; CHECK-NEXT: mov x8, v24.d[1]
+; CHECK-NEXT: scvtf s4, x9
+; CHECK-NEXT: fmov x9, d5
+; CHECK-NEXT: mov v0.s[2], v17.s[0]
+; CHECK-NEXT: mov v1.s[1], v21.s[0]
+; CHECK-NEXT: scvtf s23, x11
+; CHECK-NEXT: mov x11, v5.d[1]
+; CHECK-NEXT: mov v2.s[3], v19.s[0]
+; CHECK-NEXT: scvtf s21, x8
+; CHECK-NEXT: mov x8, v20.d[1]
+; CHECK-NEXT: scvtf s17, x9
+; CHECK-NEXT: fmov x9, d24
+; CHECK-NEXT: mov v4.s[1], v26.s[0]
+; CHECK-NEXT: mov v0.s[3], v25.s[0]
+; CHECK-NEXT: ldp q26, q24, [sp, #96]
+; CHECK-NEXT: mov v1.s[2], v23.s[0]
+; CHECK-NEXT: ldp q25, q23, [sp, #64]
+; CHECK-NEXT: scvtf s7, x11
+; CHECK-NEXT: scvtf s27, x8
+; CHECK-NEXT: fmov x8, d18
+; CHECK-NEXT: scvtf s5, x9
+; CHECK-NEXT: mov x10, v26.d[1]
+; CHECK-NEXT: mov x9, v18.d[1]
+; CHECK-NEXT: fmov x11, d20
+; CHECK-NEXT: mov v4.s[2], v17.s[0]
+; CHECK-NEXT: mov v1.s[3], v22.s[0]
+; CHECK-NEXT: ushr v19.4s, v2.4s, #16
+; CHECK-NEXT: scvtf s17, x8
+; CHECK-NEXT: fmov x8, d26
+; CHECK-NEXT: add v26.4s, v2.4s, v6.4s
+; CHECK-NEXT: scvtf s22, x11
+; CHECK-NEXT: mov x11, v25.d[1]
+; CHECK-NEXT: mov v5.s[1], v21.s[0]
+; CHECK-NEXT: scvtf s28, x10
+; CHECK-NEXT: fmov x10, d16
+; CHECK-NEXT: scvtf s21, x9
+; CHECK-NEXT: fmov x9, d25
+; CHECK-NEXT: scvtf s18, x8
+; CHECK-NEXT: mov x8, v16.d[1]
+; CHECK-NEXT: mov v4.s[3], v7.s[0]
+; CHECK-NEXT: and v19.16b, v19.16b, v3.16b
+; CHECK-NEXT: scvtf s16, x10
+; CHECK-NEXT: fmov x10, d24
+; CHECK-NEXT: scvtf s25, x11
+; CHECK-NEXT: scvtf s20, x9
+; CHECK-NEXT: mov x9, v24.d[1]
+; CHECK-NEXT: mov v17.s[1], v21.s[0]
+; CHECK-NEXT: fmov x11, d23
+; CHECK-NEXT: mov v18.s[1], v28.s[0]
+; CHECK-NEXT: scvtf s24, x8
+; CHECK-NEXT: scvtf s21, x10
+; CHECK-NEXT: mov x10, v23.d[1]
+; CHECK-NEXT: mov v5.s[2], v22.s[0]
+; CHECK-NEXT: ushr v22.4s, v1.4s, #16
+; CHECK-NEXT: ushr v28.4s, v0.4s, #16
+; CHECK-NEXT: scvtf s23, x11
+; CHECK-NEXT: mov v20.s[1], v25.s[0]
+; CHECK-NEXT: scvtf s25, x9
+; CHECK-NEXT: mov v17.s[2], v16.s[0]
+; CHECK-NEXT: add v16.4s, v19.4s, v26.4s
+; CHECK-NEXT: ushr v26.4s, v4.4s, #16
+; CHECK-NEXT: mov v18.s[2], v21.s[0]
+; CHECK-NEXT: scvtf s7, x10
+; CHECK-NEXT: and v22.16b, v22.16b, v3.16b
+; CHECK-NEXT: mov v5.s[3], v27.s[0]
+; CHECK-NEXT: and v21.16b, v28.16b, v3.16b
+; CHECK-NEXT: fcmeq v19.4s, v2.4s, v2.4s
+; CHECK-NEXT: mov v20.s[2], v23.s[0]
+; CHECK-NEXT: add v23.4s, v0.4s, v6.4s
+; CHECK-NEXT: orr v2.4s, #64, lsl #16
+; CHECK-NEXT: mov v17.s[3], v24.s[0]
+; CHECK-NEXT: add v24.4s, v1.4s, v6.4s
+; CHECK-NEXT: fcmeq v27.4s, v1.4s, v1.4s
+; CHECK-NEXT: mov v18.s[3], v25.s[0]
+; CHECK-NEXT: add v25.4s, v4.4s, v6.4s
; CHECK-NEXT: orr v1.4s, #64, lsl #16
-; CHECK-NEXT: and v23.16b, v25.16b, v16.16b
-; CHECK-NEXT: add v25.4s, v4.4s, v2.4s
-; CHECK-NEXT: add v7.4s, v24.4s, v7.4s
-; CHECK-NEXT: ushr v24.4s, v18.4s, #16
-; CHECK-NEXT: add v30.4s, v18.4s, v2.4s
-; CHECK-NEXT: bit v0.16b, v5.16b, v20.16b
-; CHECK-NEXT: ushr v28.4s, v22.4s, #16
-; CHECK-NEXT: add v31.4s, v22.4s, v2.4s
+; CHECK-NEXT: bit v2.16b, v16.16b, v19.16b
+; CHECK-NEXT: mov v20.s[3], v7.s[0]
+; CHECK-NEXT: add v22.4s, v22.4s, v24.4s
+; CHECK-NEXT: add v7.4s, v21.4s, v23.4s
+; CHECK-NEXT: ushr v24.4s, v17.4s, #16
+; CHECK-NEXT: and v23.16b, v26.16b, v3.16b
+; CHECK-NEXT: ushr v26.4s, v5.4s, #16
+; CHECK-NEXT: ushr v28.4s, v18.4s, #16
+; CHECK-NEXT: add v30.4s, v17.4s, v6.4s
+; CHECK-NEXT: add v31.4s, v18.4s, v6.4s
+; CHECK-NEXT: fcmeq v21.4s, v0.4s, v0.4s
+; CHECK-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-NEXT: bit v1.16b, v22.16b, v27.16b
+; CHECK-NEXT: ushr v29.4s, v20.4s, #16
+; CHECK-NEXT: and v24.16b, v24.16b, v3.16b
; CHECK-NEXT: add v23.4s, v23.4s, v25.4s
-; CHECK-NEXT: and v25.16b, v26.16b, v16.16b
-; CHECK-NEXT: add v26.4s, v6.4s, v2.4s
-; CHECK-NEXT: ushr v29.4s, v17.4s, #16
-; CHECK-NEXT: and v24.16b, v24.16b, v16.16b
-; CHECK-NEXT: add v2.4s, v17.4s, v2.4s
-; CHECK-NEXT: and v28.16b, v28.16b, v16.16b
-; CHECK-NEXT: bit v3.16b, v7.16b, v27.16b
-; CHECK-NEXT: bit v1.16b, v19.16b, v21.16b
-; CHECK-NEXT: add v25.4s, v25.4s, v26.4s
-; CHECK-NEXT: fcmeq v26.4s, v6.4s, v6.4s
-; CHECK-NEXT: orr v6.4s, #64, lsl #16
-; CHECK-NEXT: and v16.16b, v29.16b, v16.16b
+; CHECK-NEXT: and v28.16b, v28.16b, v3.16b
+; CHECK-NEXT: and v25.16b, v26.16b, v3.16b
+; CHECK-NEXT: add v26.4s, v5.4s, v6.4s
+; CHECK-NEXT: add v6.4s, v20.4s, v6.4s
+; CHECK-NEXT: and v3.16b, v29.16b, v3.16b
; CHECK-NEXT: add v24.4s, v24.4s, v30.4s
-; CHECK-NEXT: fcmeq v30.4s, v18.4s, v18.4s
+; CHECK-NEXT: fcmeq v30.4s, v17.4s, v17.4s
; CHECK-NEXT: add v28.4s, v28.4s, v31.4s
-; CHECK-NEXT: fcmeq v31.4s, v22.4s, v22.4s
+; CHECK-NEXT: fcmeq v31.4s, v18.4s, v18.4s
; CHECK-NEXT: fcmeq v29.4s, v4.4s, v4.4s
+; CHECK-NEXT: add v25.4s, v25.4s, v26.4s
+; CHECK-NEXT: fcmeq v26.4s, v5.4s, v5.4s
; CHECK-NEXT: orr v4.4s, #64, lsl #16
-; CHECK-NEXT: orr v18.4s, #64, lsl #16
-; CHECK-NEXT: orr v22.4s, #64, lsl #16
-; CHECK-NEXT: mov v5.16b, v26.16b
-; CHECK-NEXT: add v2.4s, v16.4s, v2.4s
-; CHECK-NEXT: fcmeq v16.4s, v17.4s, v17.4s
+; CHECK-NEXT: add v3.4s, v3.4s, v6.4s
+; CHECK-NEXT: fcmeq v6.4s, v20.4s, v20.4s
+; CHECK-NEXT: orr v5.4s, #64, lsl #16
; CHECK-NEXT: orr v17.4s, #64, lsl #16
-; CHECK-NEXT: uzp2 v0.8h, v1.8h, v0.8h
-; CHECK-NEXT: mov v7.16b, v31.16b
+; CHECK-NEXT: orr v18.4s, #64, lsl #16
+; CHECK-NEXT: orr v20.4s, #64, lsl #16
+; CHECK-NEXT: bit v0.16b, v7.16b, v21.16b
+; CHECK-NEXT: mov v7.16b, v30.16b
+; CHECK-NEXT: mov v16.16b, v31.16b
; CHECK-NEXT: bit v4.16b, v23.16b, v29.16b
-; CHECK-NEXT: bsl v5.16b, v25.16b, v6.16b
-; CHECK-NEXT: mov v6.16b, v30.16b
-; CHECK-NEXT: bsl v16.16b, v2.16b, v17.16b
-; CHECK-NEXT: bsl v7.16b, v28.16b, v22.16b
-; CHECK-NEXT: bsl v6.16b, v24.16b, v18.16b
-; CHECK-NEXT: uzp2 v1.8h, v4.8h, v3.8h
-; CHECK-NEXT: uzp2 v3.8h, v16.8h, v7.8h
-; CHECK-NEXT: uzp2 v2.8h, v6.8h, v5.8h
+; CHECK-NEXT: bit v5.16b, v25.16b, v26.16b
+; CHECK-NEXT: bif v3.16b, v20.16b, v6.16b
+; CHECK-NEXT: bsl v7.16b, v24.16b, v17.16b
+; CHECK-NEXT: bsl v16.16b, v28.16b, v18.16b
+; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: uzp2 v1.8h, v4.8h, v1.8h
+; CHECK-NEXT: uzp2 v2.8h, v7.8h, v5.8h
+; CHECK-NEXT: uzp2 v3.8h, v3.8h, v16.8h
; CHECK-NEXT: ret
entry:
%c = sitofp <32 x i64> %a to <32 x bfloat>
@@ -742,107 +1055,301 @@ entry:
define <32 x bfloat> @utofp_v32i64_v32bf16(<32 x i64> %a) {
; CHECK-LABEL: utofp_v32i64_v32bf16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ucvtf v17.2d, v2.2d
-; CHECK-NEXT: ucvtf v18.2d, v0.2d
-; CHECK-NEXT: ucvtf v19.2d, v3.2d
-; CHECK-NEXT: ucvtf v3.2d, v6.2d
-; CHECK-NEXT: ldp q21, q20, [sp, #32]
-; CHECK-NEXT: ucvtf v4.2d, v4.2d
-; CHECK-NEXT: ucvtf v6.2d, v7.2d
-; CHECK-NEXT: ucvtf v5.2d, v5.2d
-; CHECK-NEXT: ldp q24, q23, [sp, #64]
-; CHECK-NEXT: movi v16.4s, #1
-; CHECK-NEXT: fcvtn v0.2s, v17.2d
-; CHECK-NEXT: ucvtf v17.2d, v1.2d
-; CHECK-NEXT: fcvtn v1.2s, v18.2d
-; CHECK-NEXT: fcvtn v3.2s, v3.2d
-; CHECK-NEXT: ldp q18, q7, [sp]
-; CHECK-NEXT: ucvtf v21.2d, v21.2d
-; CHECK-NEXT: fcvtn v4.2s, v4.2d
-; CHECK-NEXT: movi v2.4s, #127, msl #8
-; CHECK-NEXT: ucvtf v20.2d, v20.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v19.2d
-; CHECK-NEXT: ldp q22, q19, [sp, #96]
-; CHECK-NEXT: fcvtn2 v1.4s, v17.2d
-; CHECK-NEXT: fcvtn2 v3.4s, v6.2d
-; CHECK-NEXT: ucvtf v18.2d, v18.2d
-; CHECK-NEXT: ucvtf v17.2d, v24.2d
-; CHECK-NEXT: fcvtn v6.2s, v21.2d
-; CHECK-NEXT: fcvtn2 v4.4s, v5.2d
-; CHECK-NEXT: ucvtf v22.2d, v22.2d
-; CHECK-NEXT: ucvtf v21.2d, v23.2d
-; CHECK-NEXT: ucvtf v7.2d, v7.2d
-; CHECK-NEXT: ushr v24.4s, v0.4s, #16
-; CHECK-NEXT: add v5.4s, v0.4s, v2.4s
-; CHECK-NEXT: ucvtf v19.2d, v19.2d
-; CHECK-NEXT: ushr v23.4s, v1.4s, #16
-; CHECK-NEXT: ushr v25.4s, v3.4s, #16
-; CHECK-NEXT: fcvtn v18.2s, v18.2d
-; CHECK-NEXT: fcvtn2 v6.4s, v20.2d
-; CHECK-NEXT: add v26.4s, v1.4s, v2.4s
-; CHECK-NEXT: fcvtn v17.2s, v17.2d
-; CHECK-NEXT: and v24.16b, v24.16b, v16.16b
-; CHECK-NEXT: fcvtn v22.2s, v22.2d
-; CHECK-NEXT: fcmeq v20.4s, v0.4s, v0.4s
-; CHECK-NEXT: and v23.16b, v23.16b, v16.16b
-; CHECK-NEXT: orr v0.4s, #64, lsl #16
-; CHECK-NEXT: fcmeq v27.4s, v3.4s, v3.4s
-; CHECK-NEXT: fcvtn2 v18.4s, v7.2d
-; CHECK-NEXT: add v7.4s, v3.4s, v2.4s
-; CHECK-NEXT: orr v3.4s, #64, lsl #16
-; CHECK-NEXT: add v5.4s, v24.4s, v5.4s
-; CHECK-NEXT: and v24.16b, v25.16b, v16.16b
+; CHECK-NEXT: ushr v18.2d, v3.2d, #32
+; CHECK-NEXT: ushr v19.2d, v0.2d, #32
+; CHECK-NEXT: movi v16.2d, #0x000000ffffffff
+; CHECK-NEXT: ushr v17.2d, v2.2d, #32
+; CHECK-NEXT: ushr v20.2d, v1.2d, #32
+; CHECK-NEXT: ushr v23.2d, v6.2d, #32
+; CHECK-NEXT: fmov x13, d18
+; CHECK-NEXT: mov x10, v18.d[1]
+; CHECK-NEXT: mov x11, v19.d[1]
+; CHECK-NEXT: and v24.16b, v2.16b, v16.16b
+; CHECK-NEXT: mov x8, v17.d[1]
+; CHECK-NEXT: fmov x9, d17
+; CHECK-NEXT: and v0.16b, v0.16b, v16.16b
+; CHECK-NEXT: and v25.16b, v3.16b, v16.16b
+; CHECK-NEXT: and v2.16b, v1.16b, v16.16b
+; CHECK-NEXT: scvtf s26, x13
+; CHECK-NEXT: fmov x13, d19
+; CHECK-NEXT: and v1.16b, v6.16b, v16.16b
+; CHECK-NEXT: mov x12, v24.d[1]
+; CHECK-NEXT: scvtf s21, x9
+; CHECK-NEXT: scvtf s27, x10
+; CHECK-NEXT: scvtf s6, x8
+; CHECK-NEXT: fmov x10, d0
+; CHECK-NEXT: mov x9, v20.d[1]
+; CHECK-NEXT: scvtf s22, x13
+; CHECK-NEXT: fmov x13, d20
+; CHECK-NEXT: mov x8, v25.d[1]
+; CHECK-NEXT: and v31.16b, v4.16b, v16.16b
+; CHECK-NEXT: ushr v4.2d, v4.2d, #32
+; CHECK-NEXT: scvtf s19, x12
+; CHECK-NEXT: scvtf s18, x10
+; CHECK-NEXT: fmov x10, d1
+; CHECK-NEXT: scvtf s3, x13
+; CHECK-NEXT: fmov x13, d24
+; CHECK-NEXT: mov v21.s[1], v6.s[0]
+; CHECK-NEXT: scvtf s24, x11
+; CHECK-NEXT: mov x11, v23.d[1]
+; CHECK-NEXT: mov x12, v0.d[1]
+; CHECK-NEXT: scvtf s28, x8
+; CHECK-NEXT: mov x8, v2.d[1]
+; CHECK-NEXT: scvtf s17, x13
+; CHECK-NEXT: fmov x13, d25
+; CHECK-NEXT: mov v21.s[2], v26.s[0]
+; CHECK-NEXT: ushr v26.2d, v7.2d, #32
+; CHECK-NEXT: and v7.16b, v7.16b, v16.16b
+; CHECK-NEXT: scvtf s29, x11
+; CHECK-NEXT: scvtf s25, x12
+; CHECK-NEXT: mov v22.s[1], v24.s[0]
+; CHECK-NEXT: scvtf s20, x13
+; CHECK-NEXT: mov x13, v1.d[1]
+; CHECK-NEXT: ldp q1, q0, [sp, #64]
+; CHECK-NEXT: mov v17.s[1], v19.s[0]
+; CHECK-NEXT: scvtf s19, x10
+; CHECK-NEXT: fmov x10, d23
+; CHECK-NEXT: fmov x12, d26
+; CHECK-NEXT: mov x11, v26.d[1]
+; CHECK-NEXT: mov v18.s[1], v25.s[0]
+; CHECK-NEXT: mov v21.s[3], v27.s[0]
+; CHECK-NEXT: ldp q25, q24, [sp, #32]
+; CHECK-NEXT: scvtf s30, x13
+; CHECK-NEXT: scvtf s23, x10
+; CHECK-NEXT: mov w10, #1333788672 // =0x4f800000
+; CHECK-NEXT: mov v17.s[2], v20.s[0]
+; CHECK-NEXT: ldp q20, q6, [sp]
+; CHECK-NEXT: fmov x13, d7
+; CHECK-NEXT: scvtf s26, x12
+; CHECK-NEXT: fmov x12, d2
+; CHECK-NEXT: mov v22.s[2], v3.s[0]
+; CHECK-NEXT: mov v19.s[1], v30.s[0]
+; CHECK-NEXT: and v30.16b, v5.16b, v16.16b
+; CHECK-NEXT: dup v2.4s, w10
+; CHECK-NEXT: mov v17.s[3], v28.s[0]
+; CHECK-NEXT: mov v23.s[1], v29.s[0]
+; CHECK-NEXT: scvtf s28, x9
+; CHECK-NEXT: mov x9, v7.d[1]
+; CHECK-NEXT: scvtf s7, x13
+; CHECK-NEXT: scvtf s27, x12
+; CHECK-NEXT: mov x12, v31.d[1]
+; CHECK-NEXT: mov x10, v4.d[1]
+; CHECK-NEXT: ushr v5.2d, v5.2d, #32
+; CHECK-NEXT: fmul v21.4s, v21.4s, v2.4s
+; CHECK-NEXT: mov v23.s[2], v26.s[0]
+; CHECK-NEXT: scvtf s26, x11
+; CHECK-NEXT: fmov x11, d31
+; CHECK-NEXT: mov v19.s[2], v7.s[0]
+; CHECK-NEXT: scvtf s7, x9
+; CHECK-NEXT: fmov x9, d30
+; CHECK-NEXT: scvtf s29, x12
+; CHECK-NEXT: mov v18.s[2], v27.s[0]
+; CHECK-NEXT: scvtf s27, x8
+; CHECK-NEXT: scvtf s3, x11
+; CHECK-NEXT: mov v22.s[3], v28.s[0]
+; CHECK-NEXT: mov x8, v30.d[1]
+; CHECK-NEXT: scvtf s28, x9
+; CHECK-NEXT: fmov x9, d4
+; CHECK-NEXT: ushr v30.2d, v25.2d, #32
+; CHECK-NEXT: mov v23.s[3], v26.s[0]
+; CHECK-NEXT: and v31.16b, v25.16b, v16.16b
+; CHECK-NEXT: mov v19.s[3], v7.s[0]
+; CHECK-NEXT: mov v18.s[3], v27.s[0]
+; CHECK-NEXT: fmov x11, d5
+; CHECK-NEXT: fadd v17.4s, v21.4s, v17.4s
+; CHECK-NEXT: scvtf s4, x9
+; CHECK-NEXT: mov x9, v30.d[1]
+; CHECK-NEXT: mov v3.s[1], v29.s[0]
+; CHECK-NEXT: scvtf s29, x10
+; CHECK-NEXT: mov x10, v5.d[1]
+; CHECK-NEXT: fmov x13, d30
+; CHECK-NEXT: fmul v26.4s, v23.4s, v2.4s
+; CHECK-NEXT: and v23.16b, v24.16b, v16.16b
+; CHECK-NEXT: ushr v24.2d, v24.2d, #32
+; CHECK-NEXT: fmul v25.4s, v22.4s, v2.4s
+; CHECK-NEXT: mov x12, v31.d[1]
+; CHECK-NEXT: scvtf s22, x11
+; CHECK-NEXT: scvtf s27, x9
+; CHECK-NEXT: scvtf s7, x13
+; CHECK-NEXT: mov v3.s[2], v28.s[0]
+; CHECK-NEXT: scvtf s30, x10
+; CHECK-NEXT: fmov x10, d24
+; CHECK-NEXT: scvtf s28, x8
+; CHECK-NEXT: mov x8, v23.d[1]
+; CHECK-NEXT: fmov x11, d31
+; CHECK-NEXT: and v31.16b, v20.16b, v16.16b
+; CHECK-NEXT: mov x9, v24.d[1]
+; CHECK-NEXT: mov v4.s[1], v29.s[0]
+; CHECK-NEXT: ushr v24.2d, v20.2d, #32
+; CHECK-NEXT: mov v7.s[1], v27.s[0]
+; CHECK-NEXT: scvtf s27, x10
+; CHECK-NEXT: fadd v19.4s, v26.4s, v19.4s
+; CHECK-NEXT: scvtf s29, x12
+; CHECK-NEXT: scvtf s5, x11
+; CHECK-NEXT: fmov x11, d23
+; CHECK-NEXT: scvtf s21, x8
+; CHECK-NEXT: fmov x8, d31
+; CHECK-NEXT: mov x10, v31.d[1]
+; CHECK-NEXT: scvtf s26, x9
+; CHECK-NEXT: mov x9, v24.d[1]
+; CHECK-NEXT: mov v4.s[2], v22.s[0]
+; CHECK-NEXT: ldp q23, q22, [sp, #96]
+; CHECK-NEXT: mov v7.s[2], v27.s[0]
+; CHECK-NEXT: scvtf s20, x8
+; CHECK-NEXT: fmov x8, d24
+; CHECK-NEXT: ushr v24.2d, v6.2d, #32
+; CHECK-NEXT: fadd v18.4s, v25.4s, v18.4s
+; CHECK-NEXT: mov v3.s[3], v28.s[0]
+; CHECK-NEXT: and v28.16b, v6.16b, v16.16b
+; CHECK-NEXT: ushr v25.2d, v23.2d, #32
+; CHECK-NEXT: mov v5.s[1], v29.s[0]
+; CHECK-NEXT: scvtf s29, x11
+; CHECK-NEXT: mov v7.s[3], v26.s[0]
+; CHECK-NEXT: scvtf s26, x9
+; CHECK-NEXT: scvtf s6, x8
+; CHECK-NEXT: scvtf s27, x10
+; CHECK-NEXT: fmov x11, d24
+; CHECK-NEXT: fmov x9, d28
+; CHECK-NEXT: mov x10, v25.d[1]
+; CHECK-NEXT: mov x8, v28.d[1]
+; CHECK-NEXT: mov v4.s[3], v30.s[0]
+; CHECK-NEXT: mov v5.s[2], v29.s[0]
+; CHECK-NEXT: and v29.16b, v23.16b, v16.16b
+; CHECK-NEXT: mov v6.s[1], v26.s[0]
+; CHECK-NEXT: scvtf s26, x11
+; CHECK-NEXT: fmov x11, d25
+; CHECK-NEXT: mov v20.s[1], v27.s[0]
+; CHECK-NEXT: ushr v27.2d, v1.2d, #32
+; CHECK-NEXT: scvtf s23, x9
+; CHECK-NEXT: mov x9, v24.d[1]
+; CHECK-NEXT: scvtf s28, x10
+; CHECK-NEXT: and v25.16b, v22.16b, v16.16b
+; CHECK-NEXT: scvtf s24, x11
+; CHECK-NEXT: fmov x10, d29
+; CHECK-NEXT: ushr v22.2d, v22.2d, #32
+; CHECK-NEXT: fmov x13, d27
+; CHECK-NEXT: mov x12, v29.d[1]
+; CHECK-NEXT: mov x11, v27.d[1]
+; CHECK-NEXT: and v29.16b, v1.16b, v16.16b
+; CHECK-NEXT: mov v6.s[2], v26.s[0]
+; CHECK-NEXT: fmul v4.4s, v4.4s, v2.4s
+; CHECK-NEXT: scvtf s1, x10
+; CHECK-NEXT: fmov x10, d25
+; CHECK-NEXT: scvtf s30, x9
+; CHECK-NEXT: mov v24.s[1], v28.s[0]
+; CHECK-NEXT: scvtf s27, x13
+; CHECK-NEXT: ushr v28.2d, v0.2d, #32
+; CHECK-NEXT: fmov x13, d22
+; CHECK-NEXT: scvtf s31, x12
+; CHECK-NEXT: mov x9, v25.d[1]
+; CHECK-NEXT: mov x12, v29.d[1]
+; CHECK-NEXT: scvtf s25, x11
+; CHECK-NEXT: mov x11, v22.d[1]
+; CHECK-NEXT: scvtf s22, x10
+; CHECK-NEXT: fmov x10, d29
+; CHECK-NEXT: and v0.16b, v0.16b, v16.16b
+; CHECK-NEXT: scvtf s29, x13
+; CHECK-NEXT: fmov x13, d28
+; CHECK-NEXT: mov v6.s[3], v30.s[0]
+; CHECK-NEXT: mov v1.s[1], v31.s[0]
+; CHECK-NEXT: scvtf s30, x9
+; CHECK-NEXT: mov v20.s[2], v23.s[0]
+; CHECK-NEXT: scvtf s16, x12
+; CHECK-NEXT: mov x12, v28.d[1]
+; CHECK-NEXT: scvtf s28, x10
+; CHECK-NEXT: mov v27.s[1], v25.s[0]
+; CHECK-NEXT: scvtf s25, x13
+; CHECK-NEXT: fmov x13, d0
+; CHECK-NEXT: mov x10, v0.d[1]
+; CHECK-NEXT: scvtf s0, x11
+; CHECK-NEXT: mov v24.s[2], v29.s[0]
+; CHECK-NEXT: mov v1.s[2], v22.s[0]
+; CHECK-NEXT: movi v29.4s, #1
+; CHECK-NEXT: movi v23.4s, #127, msl #8
+; CHECK-NEXT: scvtf s26, x13
+; CHECK-NEXT: scvtf s31, x12
+; CHECK-NEXT: mov v28.s[1], v16.s[0]
+; CHECK-NEXT: mov v27.s[2], v25.s[0]
+; CHECK-NEXT: ushr v16.4s, v17.4s, #16
+; CHECK-NEXT: scvtf s25, x8
+; CHECK-NEXT: mov v24.s[3], v0.s[0]
+; CHECK-NEXT: scvtf s0, x10
+; CHECK-NEXT: fadd v3.4s, v4.4s, v3.4s
+; CHECK-NEXT: mov v5.s[3], v21.s[0]
+; CHECK-NEXT: fmul v4.4s, v7.4s, v2.4s
+; CHECK-NEXT: mov v1.s[3], v30.s[0]
+; CHECK-NEXT: mov v28.s[2], v26.s[0]
+; CHECK-NEXT: and v16.16b, v16.16b, v29.16b
+; CHECK-NEXT: add v22.4s, v17.4s, v23.4s
+; CHECK-NEXT: mov v27.s[3], v31.s[0]
+; CHECK-NEXT: ushr v26.4s, v18.4s, #16
+; CHECK-NEXT: mov v20.s[3], v25.s[0]
+; CHECK-NEXT: fmul v7.4s, v24.4s, v2.4s
+; CHECK-NEXT: fmul v6.4s, v6.4s, v2.4s
+; CHECK-NEXT: ushr v31.4s, v19.4s, #16
+; CHECK-NEXT: fadd v4.4s, v4.4s, v5.4s
+; CHECK-NEXT: ushr v24.4s, v3.4s, #16
+; CHECK-NEXT: mov v28.s[3], v0.s[0]
+; CHECK-NEXT: and v21.16b, v26.16b, v29.16b
+; CHECK-NEXT: fcmeq v26.4s, v19.4s, v19.4s
+; CHECK-NEXT: fmul v0.4s, v27.4s, v2.4s
+; CHECK-NEXT: add v2.4s, v16.4s, v22.4s
+; CHECK-NEXT: add v22.4s, v18.4s, v23.4s
+; CHECK-NEXT: fadd v1.4s, v7.4s, v1.4s
+; CHECK-NEXT: fadd v6.4s, v6.4s, v20.4s
+; CHECK-NEXT: and v5.16b, v31.16b, v29.16b
+; CHECK-NEXT: add v20.4s, v19.4s, v23.4s
; CHECK-NEXT: ushr v25.4s, v4.4s, #16
-; CHECK-NEXT: fcvtn2 v22.4s, v19.2d
-; CHECK-NEXT: add v19.4s, v23.4s, v26.4s
-; CHECK-NEXT: ushr v26.4s, v6.4s, #16
-; CHECK-NEXT: fcvtn2 v17.4s, v21.2d
-; CHECK-NEXT: fcmeq v21.4s, v1.4s, v1.4s
-; CHECK-NEXT: orr v1.4s, #64, lsl #16
-; CHECK-NEXT: and v23.16b, v25.16b, v16.16b
-; CHECK-NEXT: add v25.4s, v4.4s, v2.4s
-; CHECK-NEXT: add v7.4s, v24.4s, v7.4s
-; CHECK-NEXT: ushr v24.4s, v18.4s, #16
-; CHECK-NEXT: add v30.4s, v18.4s, v2.4s
-; CHECK-NEXT: bit v0.16b, v5.16b, v20.16b
-; CHECK-NEXT: ushr v28.4s, v22.4s, #16
-; CHECK-NEXT: add v31.4s, v22.4s, v2.4s
-; CHECK-NEXT: add v23.4s, v23.4s, v25.4s
-; CHECK-NEXT: and v25.16b, v26.16b, v16.16b
-; CHECK-NEXT: add v26.4s, v6.4s, v2.4s
-; CHECK-NEXT: ushr v29.4s, v17.4s, #16
-; CHECK-NEXT: and v24.16b, v24.16b, v16.16b
-; CHECK-NEXT: add v2.4s, v17.4s, v2.4s
-; CHECK-NEXT: and v28.16b, v28.16b, v16.16b
-; CHECK-NEXT: bit v3.16b, v7.16b, v27.16b
-; CHECK-NEXT: bit v1.16b, v19.16b, v21.16b
-; CHECK-NEXT: add v25.4s, v25.4s, v26.4s
-; CHECK-NEXT: fcmeq v26.4s, v6.4s, v6.4s
-; CHECK-NEXT: orr v6.4s, #64, lsl #16
-; CHECK-NEXT: and v16.16b, v29.16b, v16.16b
-; CHECK-NEXT: add v24.4s, v24.4s, v30.4s
-; CHECK-NEXT: fcmeq v30.4s, v18.4s, v18.4s
-; CHECK-NEXT: add v28.4s, v28.4s, v31.4s
-; CHECK-NEXT: fcmeq v31.4s, v22.4s, v22.4s
-; CHECK-NEXT: fcmeq v29.4s, v4.4s, v4.4s
-; CHECK-NEXT: orr v4.4s, #64, lsl #16
-; CHECK-NEXT: orr v18.4s, #64, lsl #16
-; CHECK-NEXT: orr v22.4s, #64, lsl #16
-; CHECK-NEXT: mov v5.16b, v26.16b
-; CHECK-NEXT: add v2.4s, v16.4s, v2.4s
; CHECK-NEXT: fcmeq v16.4s, v17.4s, v17.4s
+; CHECK-NEXT: add v7.4s, v21.4s, v22.4s
+; CHECK-NEXT: and v22.16b, v24.16b, v29.16b
+; CHECK-NEXT: add v24.4s, v3.4s, v23.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v28.4s
; CHECK-NEXT: orr v17.4s, #64, lsl #16
-; CHECK-NEXT: uzp2 v0.8h, v1.8h, v0.8h
-; CHECK-NEXT: mov v7.16b, v31.16b
-; CHECK-NEXT: bit v4.16b, v23.16b, v29.16b
-; CHECK-NEXT: bsl v5.16b, v25.16b, v6.16b
-; CHECK-NEXT: mov v6.16b, v30.16b
-; CHECK-NEXT: bsl v16.16b, v2.16b, v17.16b
-; CHECK-NEXT: bsl v7.16b, v28.16b, v22.16b
-; CHECK-NEXT: bsl v6.16b, v24.16b, v18.16b
-; CHECK-NEXT: uzp2 v1.8h, v4.8h, v3.8h
-; CHECK-NEXT: uzp2 v3.8h, v16.8h, v7.8h
-; CHECK-NEXT: uzp2 v2.8h, v6.8h, v5.8h
+; CHECK-NEXT: fcmeq v21.4s, v18.4s, v18.4s
+; CHECK-NEXT: ushr v27.4s, v1.4s, #16
+; CHECK-NEXT: add v5.4s, v5.4s, v20.4s
+; CHECK-NEXT: ushr v20.4s, v6.4s, #16
+; CHECK-NEXT: add v22.4s, v22.4s, v24.4s
+; CHECK-NEXT: and v24.16b, v25.16b, v29.16b
+; CHECK-NEXT: add v25.4s, v4.4s, v23.4s
+; CHECK-NEXT: add v30.4s, v6.4s, v23.4s
+; CHECK-NEXT: add v31.4s, v1.4s, v23.4s
+; CHECK-NEXT: orr v18.4s, #64, lsl #16
+; CHECK-NEXT: ushr v28.4s, v0.4s, #16
+; CHECK-NEXT: and v27.16b, v27.16b, v29.16b
+; CHECK-NEXT: add v23.4s, v0.4s, v23.4s
+; CHECK-NEXT: and v20.16b, v20.16b, v29.16b
+; CHECK-NEXT: add v24.4s, v24.4s, v25.4s
+; CHECK-NEXT: fcmeq v25.4s, v4.4s, v4.4s
+; CHECK-NEXT: orr v19.4s, #64, lsl #16
+; CHECK-NEXT: orr v4.4s, #64, lsl #16
+; CHECK-NEXT: bif v2.16b, v17.16b, v16.16b
+; CHECK-NEXT: and v28.16b, v28.16b, v29.16b
+; CHECK-NEXT: add v27.4s, v27.4s, v31.4s
+; CHECK-NEXT: fcmeq v31.4s, v1.4s, v1.4s
+; CHECK-NEXT: fcmeq v29.4s, v3.4s, v3.4s
+; CHECK-NEXT: add v20.4s, v20.4s, v30.4s
+; CHECK-NEXT: fcmeq v30.4s, v6.4s, v6.4s
+; CHECK-NEXT: orr v3.4s, #64, lsl #16
+; CHECK-NEXT: orr v6.4s, #64, lsl #16
+; CHECK-NEXT: orr v1.4s, #64, lsl #16
+; CHECK-NEXT: add v23.4s, v28.4s, v23.4s
+; CHECK-NEXT: fcmeq v28.4s, v0.4s, v0.4s
+; CHECK-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-NEXT: mov v16.16b, v31.16b
+; CHECK-NEXT: bif v7.16b, v18.16b, v21.16b
+; CHECK-NEXT: bif v5.16b, v19.16b, v26.16b
+; CHECK-NEXT: bit v3.16b, v22.16b, v29.16b
+; CHECK-NEXT: bit v4.16b, v24.16b, v25.16b
+; CHECK-NEXT: bit v6.16b, v20.16b, v30.16b
+; CHECK-NEXT: mov v17.16b, v28.16b
+; CHECK-NEXT: bsl v16.16b, v27.16b, v1.16b
+; CHECK-NEXT: uzp2 v1.8h, v3.8h, v5.8h
+; CHECK-NEXT: bsl v17.16b, v23.16b, v0.16b
+; CHECK-NEXT: uzp2 v0.8h, v7.8h, v2.8h
+; CHECK-NEXT: uzp2 v2.8h, v6.8h, v4.8h
+; CHECK-NEXT: uzp2 v3.8h, v17.8h, v16.8h
; CHECK-NEXT: ret
entry:
%c = uitofp <32 x i64> %a to <32 x bfloat>
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index 81c1a64f2d434..d44ea4a529e05 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -4421,22 +4421,53 @@ entry:
}
define <2 x float> @stofp_v2i64_v2f32(<2 x i64> %a) {
-; CHECK-LABEL: stofp_v2i64_v2f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: scvtf v0.2d, v0.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: stofp_v2i64_v2f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov x8, v0.d[1]
+; CHECK-SD-NEXT: fmov x9, d0
+; CHECK-SD-NEXT: scvtf s0, x9
+; CHECK-SD-NEXT: scvtf s1, x8
+; CHECK-SD-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: stofp_v2i64_v2f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: scvtf v0.2d, v0.2d
+; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-GI-NEXT: ret
entry:
%c = sitofp <2 x i64> %a to <2 x float>
ret <2 x float> %c
}
define <2 x float> @utofp_v2i64_v2f32(<2 x i64> %a) {
-; CHECK-LABEL: utofp_v2i64_v2f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: utofp_v2i64_v2f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v1.2d, #0x000000ffffffff
+; CHECK-SD-NEXT: ushr v2.2d, v0.2d, #32
+; CHECK-SD-NEXT: mov x8, v2.d[1]
+; CHECK-SD-NEXT: fmov x9, d2
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: scvtf s2, x9
+; CHECK-SD-NEXT: mov w9, #1333788672 // =0x4f800000
+; CHECK-SD-NEXT: scvtf s1, x8
+; CHECK-SD-NEXT: mov x8, v0.d[1]
+; CHECK-SD-NEXT: dup v3.2s, w9
+; CHECK-SD-NEXT: fmov x9, d0
+; CHECK-SD-NEXT: scvtf s0, x8
+; CHECK-SD-NEXT: mov v2.s[1], v1.s[0]
+; CHECK-SD-NEXT: scvtf s1, x9
+; CHECK-SD-NEXT: fmul v2.2s, v2.2s, v3.2s
+; CHECK-SD-NEXT: mov v1.s[1], v0.s[0]
+; CHECK-SD-NEXT: fadd v0.2s, v2.2s, v1.2s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: utofp_v2i64_v2f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ucvtf v0.2d, v0.2d
+; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-GI-NEXT: ret
entry:
%c = uitofp <2 x i64> %a to <2 x float>
ret <2 x float> %c
@@ -4446,13 +4477,18 @@ define <3 x float> @stofp_v3i64_v3f32(<3 x i64> %a) {
; CHECK-SD-LABEL: stofp_v3i64_v3f32:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-SD-NEXT: scvtf v1.2d, v2.2d
-; CHECK-SD-NEXT: scvtf v0.2d, v0.2d
-; CHECK-SD-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-SD-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT: scvtf s3, x8
+; CHECK-SD-NEXT: fmov x8, d1
+; CHECK-SD-NEXT: scvtf s1, x8
+; CHECK-SD-NEXT: fmov x8, d2
+; CHECK-SD-NEXT: mov v0.s[0], v3.s[0]
+; CHECK-SD-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-SD-NEXT: scvtf s1, x8
+; CHECK-SD-NEXT: mov v0.s[2], v1.s[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: stofp_v3i64_v3f32:
@@ -4480,11 +4516,38 @@ define <3 x float> @utofp_v3i64_v3f32(<3 x i64> %a) {
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT: ushr v5.2d, v2.2d, #32
; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-SD-NEXT: ucvtf v1.2d, v2.2d
-; CHECK-SD-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-SD-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-SD-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-SD-NEXT: movi v1.2d, #0x000000ffffffff
+; CHECK-SD-NEXT: ushr v3.2d, v0.2d, #32
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-SD-NEXT: mov x8, v3.d[1]
+; CHECK-SD-NEXT: fmov x10, d3
+; CHECK-SD-NEXT: mov x9, v0.d[1]
+; CHECK-SD-NEXT: scvtf s4, x10
+; CHECK-SD-NEXT: scvtf s3, x8
+; CHECK-SD-NEXT: fmov x8, d0
+; CHECK-SD-NEXT: scvtf s0, x9
+; CHECK-SD-NEXT: mov x9, v5.d[1]
+; CHECK-SD-NEXT: scvtf s2, x8
+; CHECK-SD-NEXT: fmov x8, d5
+; CHECK-SD-NEXT: mov v4.s[1], v3.s[0]
+; CHECK-SD-NEXT: scvtf s3, x8
+; CHECK-SD-NEXT: fmov x8, d1
+; CHECK-SD-NEXT: mov v2.s[1], v0.s[0]
+; CHECK-SD-NEXT: scvtf s0, x8
+; CHECK-SD-NEXT: mov x8, v1.d[1]
+; CHECK-SD-NEXT: scvtf s1, x9
+; CHECK-SD-NEXT: mov v4.s[2], v3.s[0]
+; CHECK-SD-NEXT: mov v2.s[2], v0.s[0]
+; CHECK-SD-NEXT: scvtf s0, x8
+; CHECK-SD-NEXT: mov w8, #1333788672 // =0x4f800000
+; CHECK-SD-NEXT: mov v4.s[3], v1.s[0]
+; CHECK-SD-NEXT: dup v1.4s, w8
+; CHECK-SD-NEXT: mov v2.s[3], v0.s[0]
+; CHECK-SD-NEXT: fmul v0.4s, v4.4s, v1.4s
+; CHECK-SD-NEXT: fadd v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: utofp_v3i64_v3f32:
@@ -4507,26 +4570,76 @@ entry:
}
define <4 x float> @stofp_v4i64_v4f32(<4 x i64> %a) {
-; CHECK-LABEL: stofp_v4i64_v4f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: scvtf v0.2d, v0.2d
-; CHECK-NEXT: scvtf v1.2d, v1.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: stofp_v4i64_v4f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov x8, v0.d[1]
+; CHECK-SD-NEXT: fmov x9, d0
+; CHECK-SD-NEXT: scvtf s0, x9
+; CHECK-SD-NEXT: mov x9, v1.d[1]
+; CHECK-SD-NEXT: scvtf s2, x8
+; CHECK-SD-NEXT: fmov x8, d1
+; CHECK-SD-NEXT: scvtf s1, x8
+; CHECK-SD-NEXT: mov v0.s[1], v2.s[0]
+; CHECK-SD-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-SD-NEXT: scvtf s1, x9
+; CHECK-SD-NEXT: mov v0.s[3], v1.s[0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: stofp_v4i64_v4f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: scvtf v0.2d, v0.2d
+; CHECK-GI-NEXT: scvtf v1.2d, v1.2d
+; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-GI-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-GI-NEXT: ret
entry:
%c = sitofp <4 x i64> %a to <4 x float>
ret <4 x float> %c
}
define <4 x float> @utofp_v4i64_v4f32(<4 x i64> %a) {
-; CHECK-LABEL: utofp_v4i64_v4f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: utofp_v4i64_v4f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v2.2d, #0x000000ffffffff
+; CHECK-SD-NEXT: ushr v3.2d, v0.2d, #32
+; CHECK-SD-NEXT: ushr v4.2d, v1.2d, #32
+; CHECK-SD-NEXT: mov x8, v3.d[1]
+; CHECK-SD-NEXT: fmov x10, d3
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT: scvtf s3, x10
+; CHECK-SD-NEXT: scvtf s5, x8
+; CHECK-SD-NEXT: fmov x8, d0
+; CHECK-SD-NEXT: mov x9, v0.d[1]
+; CHECK-SD-NEXT: scvtf s2, x8
+; CHECK-SD-NEXT: fmov x8, d4
+; CHECK-SD-NEXT: scvtf s0, x9
+; CHECK-SD-NEXT: mov x9, v4.d[1]
+; CHECK-SD-NEXT: mov v3.s[1], v5.s[0]
+; CHECK-SD-NEXT: scvtf s4, x8
+; CHECK-SD-NEXT: fmov x8, d1
+; CHECK-SD-NEXT: mov v2.s[1], v0.s[0]
+; CHECK-SD-NEXT: scvtf s0, x8
+; CHECK-SD-NEXT: mov x8, v1.d[1]
+; CHECK-SD-NEXT: scvtf s1, x9
+; CHECK-SD-NEXT: mov v3.s[2], v4.s[0]
+; CHECK-SD-NEXT: mov v2.s[2], v0.s[0]
+; CHECK-SD-NEXT: scvtf s0, x8
+; CHECK-SD-NEXT: mov w8, #1333788672 // =0x4f800000
+; CHECK-SD-NEXT: mov v3.s[3], v1.s[0]
+; CHECK-SD-NEXT: dup v1.4s, w8
+; CHECK-SD-NEXT: mov v2.s[3], v0.s[0]
+; CHECK-SD-NEXT: fmul v0.4s, v3.4s, v1.4s
+; CHECK-SD-NEXT: fadd v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: utofp_v4i64_v4f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ucvtf v0.2d, v0.2d
+; CHECK-GI-NEXT: ucvtf v1.2d, v1.2d
+; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-GI-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-GI-NEXT: ret
entry:
%c = uitofp <4 x i64> %a to <4 x float>
ret <4 x float> %c
@@ -4535,14 +4648,29 @@ entry:
define <8 x float> @stofp_v8i64_v8f32(<8 x i64> %a) {
; CHECK-SD-LABEL: stofp_v8i64_v8f32:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: scvtf v0.2d, v0.2d
-; CHECK-SD-NEXT: scvtf v2.2d, v2.2d
-; CHECK-SD-NEXT: scvtf v4.2d, v1.2d
-; CHECK-SD-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-SD-NEXT: fcvtn v1.2s, v2.2d
-; CHECK-SD-NEXT: scvtf v2.2d, v3.2d
-; CHECK-SD-NEXT: fcvtn2 v0.4s, v4.2d
-; CHECK-SD-NEXT: fcvtn2 v1.4s, v2.2d
+; CHECK-SD-NEXT: mov x8, v0.d[1]
+; CHECK-SD-NEXT: mov x9, v2.d[1]
+; CHECK-SD-NEXT: fmov x10, d0
+; CHECK-SD-NEXT: fmov x11, d2
+; CHECK-SD-NEXT: scvtf s0, x10
+; CHECK-SD-NEXT: mov x10, v3.d[1]
+; CHECK-SD-NEXT: scvtf s4, x8
+; CHECK-SD-NEXT: scvtf s5, x9
+; CHECK-SD-NEXT: scvtf s2, x11
+; CHECK-SD-NEXT: fmov x9, d1
+; CHECK-SD-NEXT: fmov x11, d3
+; CHECK-SD-NEXT: mov x8, v1.d[1]
+; CHECK-SD-NEXT: scvtf s1, x9
+; CHECK-SD-NEXT: mov v0.s[1], v4.s[0]
+; CHECK-SD-NEXT: scvtf s3, x11
+; CHECK-SD-NEXT: mov v2.s[1], v5.s[0]
+; CHECK-SD-NEXT: scvtf s4, x8
+; CHECK-SD-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-SD-NEXT: scvtf s1, x10
+; CHECK-SD-NEXT: mov v2.s[2], v3.s[0]
+; CHECK-SD-NEXT: mov v0.s[3], v4.s[0]
+; CHECK-SD-NEXT: mov v2.s[3], v1.s[0]
+; CHECK-SD-NEXT: mov v1.16b, v2.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: stofp_v8i64_v8f32:
@@ -4564,14 +4692,65 @@ entry:
define <8 x float> @utofp_v8i64_v8f32(<8 x i64> %a) {
; CHECK-SD-LABEL: utofp_v8i64_v8f32:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-SD-NEXT: ucvtf v2.2d, v2.2d
-; CHECK-SD-NEXT: ucvtf v4.2d, v1.2d
-; CHECK-SD-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-SD-NEXT: fcvtn v1.2s, v2.2d
-; CHECK-SD-NEXT: ucvtf v2.2d, v3.2d
-; CHECK-SD-NEXT: fcvtn2 v0.4s, v4.2d
-; CHECK-SD-NEXT: fcvtn2 v1.4s, v2.2d
+; CHECK-SD-NEXT: movi v4.2d, #0x000000ffffffff
+; CHECK-SD-NEXT: ushr v5.2d, v0.2d, #32
+; CHECK-SD-NEXT: ushr v6.2d, v2.2d, #32
+; CHECK-SD-NEXT: ushr v7.2d, v1.2d, #32
+; CHECK-SD-NEXT: ushr v16.2d, v3.2d, #32
+; CHECK-SD-NEXT: mov x8, v5.d[1]
+; CHECK-SD-NEXT: mov x9, v6.d[1]
+; CHECK-SD-NEXT: fmov x10, d5
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v4.16b
+; CHECK-SD-NEXT: and v2.16b, v2.16b, v4.16b
+; CHECK-SD-NEXT: fmov x13, d6
+; CHECK-SD-NEXT: fmov x12, d7
+; CHECK-SD-NEXT: and v1.16b, v1.16b, v4.16b
+; CHECK-SD-NEXT: mov x11, v7.d[1]
+; CHECK-SD-NEXT: scvtf s5, x10
+; CHECK-SD-NEXT: and v3.16b, v3.16b, v4.16b
+; CHECK-SD-NEXT: mov x10, v0.d[1]
+; CHECK-SD-NEXT: scvtf s6, x8
+; CHECK-SD-NEXT: mov x8, v2.d[1]
+; CHECK-SD-NEXT: scvtf s4, x13
+; CHECK-SD-NEXT: scvtf s7, x9
+; CHECK-SD-NEXT: fmov x9, d16
+; CHECK-SD-NEXT: scvtf s17, x12
+; CHECK-SD-NEXT: fmov x12, d0
+; CHECK-SD-NEXT: fmov x13, d2
+; CHECK-SD-NEXT: scvtf s2, x10
+; CHECK-SD-NEXT: mov v5.s[1], v6.s[0]
+; CHECK-SD-NEXT: scvtf s6, x8
+; CHECK-SD-NEXT: scvtf s0, x12
+; CHECK-SD-NEXT: scvtf s18, x13
+; CHECK-SD-NEXT: mov x8, v16.d[1]
+; CHECK-SD-NEXT: mov v4.s[1], v7.s[0]
+; CHECK-SD-NEXT: scvtf s7, x9
+; CHECK-SD-NEXT: fmov x10, d1
+; CHECK-SD-NEXT: fmov x13, d3
+; CHECK-SD-NEXT: mov x9, v1.d[1]
+; CHECK-SD-NEXT: mov x12, v3.d[1]
+; CHECK-SD-NEXT: mov v5.s[2], v17.s[0]
+; CHECK-SD-NEXT: mov v0.s[1], v2.s[0]
+; CHECK-SD-NEXT: scvtf s1, x10
+; CHECK-SD-NEXT: mov v18.s[1], v6.s[0]
+; CHECK-SD-NEXT: scvtf s2, x11
+; CHECK-SD-NEXT: scvtf s3, x13
+; CHECK-SD-NEXT: mov v4.s[2], v7.s[0]
+; CHECK-SD-NEXT: scvtf s6, x8
+; CHECK-SD-NEXT: mov w8, #1333788672 // =0x4f800000
+; CHECK-SD-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-SD-NEXT: scvtf s1, x9
+; CHECK-SD-NEXT: mov v5.s[3], v2.s[0]
+; CHECK-SD-NEXT: scvtf s2, x12
+; CHECK-SD-NEXT: mov v18.s[2], v3.s[0]
+; CHECK-SD-NEXT: mov v4.s[3], v6.s[0]
+; CHECK-SD-NEXT: dup v3.4s, w8
+; CHECK-SD-NEXT: mov v0.s[3], v1.s[0]
+; CHECK-SD-NEXT: fmul v1.4s, v5.4s, v3.4s
+; CHECK-SD-NEXT: mov v18.s[3], v2.s[0]
+; CHECK-SD-NEXT: fmul v2.4s, v4.4s, v3.4s
+; CHECK-SD-NEXT: fadd v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT: fadd v1.4s, v2.4s, v18.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: utofp_v8i64_v8f32:
@@ -4591,50 +4770,218 @@ entry:
}
define <16 x float> @stofp_v16i64_v16f32(<16 x i64> %a) {
-; CHECK-LABEL: stofp_v16i64_v16f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: scvtf v0.2d, v0.2d
-; CHECK-NEXT: scvtf v2.2d, v2.2d
-; CHECK-NEXT: scvtf v4.2d, v4.2d
-; CHECK-NEXT: scvtf v6.2d, v6.2d
-; CHECK-NEXT: scvtf v16.2d, v1.2d
-; CHECK-NEXT: scvtf v17.2d, v3.2d
-; CHECK-NEXT: scvtf v5.2d, v5.2d
-; CHECK-NEXT: scvtf v7.2d, v7.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn v1.2s, v2.2d
-; CHECK-NEXT: fcvtn v2.2s, v4.2d
-; CHECK-NEXT: fcvtn v3.2s, v6.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v16.2d
-; CHECK-NEXT: fcvtn2 v1.4s, v17.2d
-; CHECK-NEXT: fcvtn2 v2.4s, v5.2d
-; CHECK-NEXT: fcvtn2 v3.4s, v7.2d
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: stofp_v16i64_v16f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmov x13, d2
+; CHECK-SD-NEXT: mov x9, v0.d[1]
+; CHECK-SD-NEXT: mov x10, v2.d[1]
+; CHECK-SD-NEXT: fmov x11, d0
+; CHECK-SD-NEXT: mov x12, v4.d[1]
+; CHECK-SD-NEXT: mov x8, v1.d[1]
+; CHECK-SD-NEXT: scvtf s16, x13
+; CHECK-SD-NEXT: fmov x13, d4
+; CHECK-SD-NEXT: scvtf s0, x11
+; CHECK-SD-NEXT: mov x11, v6.d[1]
+; CHECK-SD-NEXT: scvtf s17, x9
+; CHECK-SD-NEXT: scvtf s18, x10
+; CHECK-SD-NEXT: fmov x9, d1
+; CHECK-SD-NEXT: scvtf s1, x12
+; CHECK-SD-NEXT: fmov x12, d6
+; CHECK-SD-NEXT: scvtf s2, x13
+; CHECK-SD-NEXT: fmov x13, d3
+; CHECK-SD-NEXT: mov x10, v3.d[1]
+; CHECK-SD-NEXT: scvtf s4, x11
+; CHECK-SD-NEXT: mov v0.s[1], v17.s[0]
+; CHECK-SD-NEXT: scvtf s6, x9
+; CHECK-SD-NEXT: scvtf s3, x12
+; CHECK-SD-NEXT: mov v16.s[1], v18.s[0]
+; CHECK-SD-NEXT: mov x9, v5.d[1]
+; CHECK-SD-NEXT: fmov x11, d5
+; CHECK-SD-NEXT: scvtf s5, x13
+; CHECK-SD-NEXT: fmov x13, d7
+; CHECK-SD-NEXT: mov x12, v7.d[1]
+; CHECK-SD-NEXT: mov v2.s[1], v1.s[0]
+; CHECK-SD-NEXT: mov v0.s[2], v6.s[0]
+; CHECK-SD-NEXT: scvtf s6, x10
+; CHECK-SD-NEXT: scvtf s7, x11
+; CHECK-SD-NEXT: scvtf s1, x13
+; CHECK-SD-NEXT: mov v3.s[1], v4.s[0]
+; CHECK-SD-NEXT: mov v16.s[2], v5.s[0]
+; CHECK-SD-NEXT: scvtf s4, x8
+; CHECK-SD-NEXT: scvtf s5, x9
+; CHECK-SD-NEXT: mov v2.s[2], v7.s[0]
+; CHECK-SD-NEXT: mov v3.s[2], v1.s[0]
+; CHECK-SD-NEXT: scvtf s1, x12
+; CHECK-SD-NEXT: mov v16.s[3], v6.s[0]
+; CHECK-SD-NEXT: mov v0.s[3], v4.s[0]
+; CHECK-SD-NEXT: mov v2.s[3], v5.s[0]
+; CHECK-SD-NEXT: mov v3.s[3], v1.s[0]
+; CHECK-SD-NEXT: mov v1.16b, v16.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: stofp_v16i64_v16f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: scvtf v0.2d, v0.2d
+; CHECK-GI-NEXT: scvtf v2.2d, v2.2d
+; CHECK-GI-NEXT: scvtf v4.2d, v4.2d
+; CHECK-GI-NEXT: scvtf v6.2d, v6.2d
+; CHECK-GI-NEXT: scvtf v16.2d, v1.2d
+; CHECK-GI-NEXT: scvtf v17.2d, v3.2d
+; CHECK-GI-NEXT: scvtf v5.2d, v5.2d
+; CHECK-GI-NEXT: scvtf v7.2d, v7.2d
+; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-GI-NEXT: fcvtn v1.2s, v2.2d
+; CHECK-GI-NEXT: fcvtn v2.2s, v4.2d
+; CHECK-GI-NEXT: fcvtn v3.2s, v6.2d
+; CHECK-GI-NEXT: fcvtn2 v0.4s, v16.2d
+; CHECK-GI-NEXT: fcvtn2 v1.4s, v17.2d
+; CHECK-GI-NEXT: fcvtn2 v2.4s, v5.2d
+; CHECK-GI-NEXT: fcvtn2 v3.4s, v7.2d
+; CHECK-GI-NEXT: ret
entry:
%c = sitofp <16 x i64> %a to <16 x float>
ret <16 x float> %c
}
define <16 x float> @utofp_v16i64_v16f32(<16 x i64> %a) {
-; CHECK-LABEL: utofp_v16i64_v16f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-NEXT: ucvtf v2.2d, v2.2d
-; CHECK-NEXT: ucvtf v4.2d, v4.2d
-; CHECK-NEXT: ucvtf v6.2d, v6.2d
-; CHECK-NEXT: ucvtf v16.2d, v1.2d
-; CHECK-NEXT: ucvtf v17.2d, v3.2d
-; CHECK-NEXT: ucvtf v5.2d, v5.2d
-; CHECK-NEXT: ucvtf v7.2d, v7.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn v1.2s, v2.2d
-; CHECK-NEXT: fcvtn v2.2s, v4.2d
-; CHECK-NEXT: fcvtn v3.2s, v6.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v16.2d
-; CHECK-NEXT: fcvtn2 v1.4s, v17.2d
-; CHECK-NEXT: fcvtn2 v2.4s, v5.2d
-; CHECK-NEXT: fcvtn2 v3.4s, v7.2d
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: utofp_v16i64_v16f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v16.2d, #0x000000ffffffff
+; CHECK-SD-NEXT: ushr v17.2d, v0.2d, #32
+; CHECK-SD-NEXT: ushr v18.2d, v1.2d, #32
+; CHECK-SD-NEXT: ushr v20.2d, v2.2d, #32
+; CHECK-SD-NEXT: ushr v19.2d, v3.2d, #32
+; CHECK-SD-NEXT: ushr v22.2d, v6.2d, #32
+; CHECK-SD-NEXT: ushr v21.2d, v4.2d, #32
+; CHECK-SD-NEXT: mov x8, v17.d[1]
+; CHECK-SD-NEXT: fmov x9, d17
+; CHECK-SD-NEXT: mov x10, v18.d[1]
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v16.16b
+; CHECK-SD-NEXT: and v23.16b, v1.16b, v16.16b
+; CHECK-SD-NEXT: and v17.16b, v2.16b, v16.16b
+; CHECK-SD-NEXT: mov x12, v20.d[1]
+; CHECK-SD-NEXT: fmov x13, d20
+; CHECK-SD-NEXT: and v6.16b, v6.16b, v16.16b
+; CHECK-SD-NEXT: scvtf s1, x9
+; CHECK-SD-NEXT: mov x9, v19.d[1]
+; CHECK-SD-NEXT: mov x11, v0.d[1]
+; CHECK-SD-NEXT: scvtf s24, x8
+; CHECK-SD-NEXT: fmov x8, d18
+; CHECK-SD-NEXT: and v18.16b, v4.16b, v16.16b
+; CHECK-SD-NEXT: fmov x14, d0
+; CHECK-SD-NEXT: scvtf s2, x13
+; CHECK-SD-NEXT: fmov x13, d17
+; CHECK-SD-NEXT: scvtf s25, x8
+; CHECK-SD-NEXT: mov x8, v23.d[1]
+; CHECK-SD-NEXT: scvtf s20, x11
+; CHECK-SD-NEXT: fmov x11, d23
+; CHECK-SD-NEXT: scvtf s0, x14
+; CHECK-SD-NEXT: mov x14, v17.d[1]
+; CHECK-SD-NEXT: scvtf s23, x12
+; CHECK-SD-NEXT: scvtf s4, x13
+; CHECK-SD-NEXT: fmov x12, d19
+; CHECK-SD-NEXT: scvtf s19, x10
+; CHECK-SD-NEXT: fmov x10, d18
+; CHECK-SD-NEXT: scvtf s17, x11
+; CHECK-SD-NEXT: mov x11, v18.d[1]
+; CHECK-SD-NEXT: mov v1.s[1], v24.s[0]
+; CHECK-SD-NEXT: mov v0.s[1], v20.s[0]
+; CHECK-SD-NEXT: and v24.16b, v3.16b, v16.16b
+; CHECK-SD-NEXT: scvtf s20, x14
+; CHECK-SD-NEXT: scvtf s3, x10
+; CHECK-SD-NEXT: mov x10, v21.d[1]
+; CHECK-SD-NEXT: fmov x14, d21
+; CHECK-SD-NEXT: mov v2.s[1], v23.s[0]
+; CHECK-SD-NEXT: scvtf s18, x11
+; CHECK-SD-NEXT: mov x11, v22.d[1]
+; CHECK-SD-NEXT: mov v1.s[2], v25.s[0]
+; CHECK-SD-NEXT: mov v0.s[2], v17.s[0]
+; CHECK-SD-NEXT: ushr v17.2d, v5.2d, #32
+; CHECK-SD-NEXT: fmov x13, d24
+; CHECK-SD-NEXT: mov v4.s[1], v20.s[0]
+; CHECK-SD-NEXT: and v5.16b, v5.16b, v16.16b
+; CHECK-SD-NEXT: scvtf s25, x12
+; CHECK-SD-NEXT: mov x12, v24.d[1]
+; CHECK-SD-NEXT: scvtf s20, x11
+; CHECK-SD-NEXT: fmov x11, d22
+; CHECK-SD-NEXT: mov v1.s[3], v19.s[0]
+; CHECK-SD-NEXT: mov v3.s[1], v18.s[0]
+; CHECK-SD-NEXT: scvtf s18, x10
+; CHECK-SD-NEXT: scvtf s19, x14
+; CHECK-SD-NEXT: mov x14, v17.d[1]
+; CHECK-SD-NEXT: mov x10, v6.d[1]
+; CHECK-SD-NEXT: mov v2.s[2], v25.s[0]
+; CHECK-SD-NEXT: scvtf s21, x11
+; CHECK-SD-NEXT: fmov x11, d17
+; CHECK-SD-NEXT: ushr v17.2d, v7.2d, #32
+; CHECK-SD-NEXT: and v7.16b, v7.16b, v16.16b
+; CHECK-SD-NEXT: scvtf s16, x13
+; CHECK-SD-NEXT: fmov x13, d5
+; CHECK-SD-NEXT: mov v19.s[1], v18.s[0]
+; CHECK-SD-NEXT: scvtf s22, x11
+; CHECK-SD-NEXT: fmov x11, d6
+; CHECK-SD-NEXT: scvtf s6, x10
+; CHECK-SD-NEXT: mov x10, v17.d[1]
+; CHECK-SD-NEXT: mov v21.s[1], v20.s[0]
+; CHECK-SD-NEXT: scvtf s20, x13
+; CHECK-SD-NEXT: fmov x13, d7
+; CHECK-SD-NEXT: mov v4.s[2], v16.s[0]
+; CHECK-SD-NEXT: scvtf s16, x8
+; CHECK-SD-NEXT: mov w8, #1333788672 // =0x4f800000
+; CHECK-SD-NEXT: scvtf s18, x11
+; CHECK-SD-NEXT: fmov x11, d17
+; CHECK-SD-NEXT: mov v19.s[2], v22.s[0]
+; CHECK-SD-NEXT: mov v3.s[2], v20.s[0]
+; CHECK-SD-NEXT: scvtf s20, x12
+; CHECK-SD-NEXT: scvtf s17, x11
+; CHECK-SD-NEXT: mov x11, v5.d[1]
+; CHECK-SD-NEXT: scvtf s5, x9
+; CHECK-SD-NEXT: mov v18.s[1], v6.s[0]
+; CHECK-SD-NEXT: scvtf s6, x14
+; CHECK-SD-NEXT: mov x9, v7.d[1]
+; CHECK-SD-NEXT: scvtf s7, x13
+; CHECK-SD-NEXT: mov v0.s[3], v16.s[0]
+; CHECK-SD-NEXT: mov v4.s[3], v20.s[0]
+; CHECK-SD-NEXT: mov v21.s[2], v17.s[0]
+; CHECK-SD-NEXT: scvtf s17, x10
+; CHECK-SD-NEXT: mov v2.s[3], v5.s[0]
+; CHECK-SD-NEXT: mov v19.s[3], v6.s[0]
+; CHECK-SD-NEXT: scvtf s6, x11
+; CHECK-SD-NEXT: dup v5.4s, w8
+; CHECK-SD-NEXT: mov v18.s[2], v7.s[0]
+; CHECK-SD-NEXT: scvtf s7, x9
+; CHECK-SD-NEXT: mov v21.s[3], v17.s[0]
+; CHECK-SD-NEXT: fmul v1.4s, v1.4s, v5.4s
+; CHECK-SD-NEXT: fmul v2.4s, v2.4s, v5.4s
+; CHECK-SD-NEXT: mov v3.s[3], v6.s[0]
+; CHECK-SD-NEXT: fmul v6.4s, v19.4s, v5.4s
+; CHECK-SD-NEXT: mov v18.s[3], v7.s[0]
+; CHECK-SD-NEXT: fmul v5.4s, v21.4s, v5.4s
+; CHECK-SD-NEXT: fadd v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT: fadd v1.4s, v2.4s, v4.4s
+; CHECK-SD-NEXT: fadd v2.4s, v6.4s, v3.4s
+; CHECK-SD-NEXT: fadd v3.4s, v5.4s, v18.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: utofp_v16i64_v16f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ucvtf v0.2d, v0.2d
+; CHECK-GI-NEXT: ucvtf v2.2d, v2.2d
+; CHECK-GI-NEXT: ucvtf v4.2d, v4.2d
+; CHECK-GI-NEXT: ucvtf v6.2d, v6.2d
+; CHECK-GI-NEXT: ucvtf v16.2d, v1.2d
+; CHECK-GI-NEXT: ucvtf v17.2d, v3.2d
+; CHECK-GI-NEXT: ucvtf v5.2d, v5.2d
+; CHECK-GI-NEXT: ucvtf v7.2d, v7.2d
+; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-GI-NEXT: fcvtn v1.2s, v2.2d
+; CHECK-GI-NEXT: fcvtn v2.2s, v4.2d
+; CHECK-GI-NEXT: fcvtn v3.2s, v6.2d
+; CHECK-GI-NEXT: fcvtn2 v0.4s, v16.2d
+; CHECK-GI-NEXT: fcvtn2 v1.4s, v17.2d
+; CHECK-GI-NEXT: fcvtn2 v2.4s, v5.2d
+; CHECK-GI-NEXT: fcvtn2 v3.4s, v7.2d
+; CHECK-GI-NEXT: ret
entry:
%c = uitofp <16 x i64> %a to <16 x float>
ret <16 x float> %c
@@ -4643,42 +4990,99 @@ entry:
define <32 x float> @stofp_v32i64_v32f32(<32 x i64> %a) {
; CHECK-SD-LABEL: stofp_v32i64_v32f32:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldp q17, q16, [sp, #64]
-; CHECK-SD-NEXT: scvtf v0.2d, v0.2d
-; CHECK-SD-NEXT: ldp q19, q18, [sp, #32]
-; CHECK-SD-NEXT: scvtf v2.2d, v2.2d
-; CHECK-SD-NEXT: ldp q21, q20, [sp]
-; CHECK-SD-NEXT: scvtf v4.2d, v4.2d
-; CHECK-SD-NEXT: ldp q23, q22, [sp, #96]
-; CHECK-SD-NEXT: scvtf v6.2d, v6.2d
-; CHECK-SD-NEXT: scvtf v19.2d, v19.2d
-; CHECK-SD-NEXT: scvtf v17.2d, v17.2d
-; CHECK-SD-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-SD-NEXT: scvtf v21.2d, v21.2d
-; CHECK-SD-NEXT: scvtf v24.2d, v1.2d
-; CHECK-SD-NEXT: fcvtn v1.2s, v2.2d
-; CHECK-SD-NEXT: scvtf v23.2d, v23.2d
-; CHECK-SD-NEXT: scvtf v25.2d, v3.2d
-; CHECK-SD-NEXT: fcvtn v2.2s, v4.2d
-; CHECK-SD-NEXT: scvtf v26.2d, v5.2d
-; CHECK-SD-NEXT: fcvtn v3.2s, v6.2d
-; CHECK-SD-NEXT: scvtf v27.2d, v7.2d
-; CHECK-SD-NEXT: scvtf v20.2d, v20.2d
-; CHECK-SD-NEXT: fcvtn v5.2s, v19.2d
-; CHECK-SD-NEXT: scvtf v18.2d, v18.2d
-; CHECK-SD-NEXT: fcvtn v4.2s, v21.2d
-; CHECK-SD-NEXT: fcvtn v6.2s, v17.2d
-; CHECK-SD-NEXT: scvtf v16.2d, v16.2d
-; CHECK-SD-NEXT: fcvtn v7.2s, v23.2d
-; CHECK-SD-NEXT: scvtf v17.2d, v22.2d
-; CHECK-SD-NEXT: fcvtn2 v0.4s, v24.2d
-; CHECK-SD-NEXT: fcvtn2 v1.4s, v25.2d
-; CHECK-SD-NEXT: fcvtn2 v2.4s, v26.2d
-; CHECK-SD-NEXT: fcvtn2 v3.4s, v27.2d
-; CHECK-SD-NEXT: fcvtn2 v5.4s, v18.2d
-; CHECK-SD-NEXT: fcvtn2 v4.4s, v20.2d
-; CHECK-SD-NEXT: fcvtn2 v6.4s, v16.2d
-; CHECK-SD-NEXT: fcvtn2 v7.4s, v17.2d
+; CHECK-SD-NEXT: mov x8, v0.d[1]
+; CHECK-SD-NEXT: fmov x10, d0
+; CHECK-SD-NEXT: mov v16.16b, v1.16b
+; CHECK-SD-NEXT: fmov x11, d2
+; CHECK-SD-NEXT: ldp q24, q20, [sp]
+; CHECK-SD-NEXT: mov x9, v2.d[1]
+; CHECK-SD-NEXT: fmov x12, d3
+; CHECK-SD-NEXT: fmov x13, d4
+; CHECK-SD-NEXT: scvtf s0, x10
+; CHECK-SD-NEXT: ldp q21, q18, [sp, #32]
+; CHECK-SD-NEXT: scvtf s2, x8
+; CHECK-SD-NEXT: scvtf s1, x11
+; CHECK-SD-NEXT: mov x10, v4.d[1]
+; CHECK-SD-NEXT: fmov x11, d16
+; CHECK-SD-NEXT: ldp q19, q17, [sp, #96]
+; CHECK-SD-NEXT: scvtf s22, x9
+; CHECK-SD-NEXT: mov x8, v3.d[1]
+; CHECK-SD-NEXT: scvtf s4, x12
+; CHECK-SD-NEXT: mov x12, v24.d[1]
+; CHECK-SD-NEXT: mov x9, v16.d[1]
+; CHECK-SD-NEXT: scvtf s3, x11
+; CHECK-SD-NEXT: ldp q23, q16, [sp, #64]
+; CHECK-SD-NEXT: mov v0.s[1], v2.s[0]
+; CHECK-SD-NEXT: scvtf s25, x10
+; CHECK-SD-NEXT: fmov x10, d6
+; CHECK-SD-NEXT: mov v1.s[1], v22.s[0]
+; CHECK-SD-NEXT: mov x11, v6.d[1]
+; CHECK-SD-NEXT: scvtf s2, x13
+; CHECK-SD-NEXT: mov x13, v21.d[1]
+; CHECK-SD-NEXT: fmov x14, d19
+; CHECK-SD-NEXT: scvtf s22, x9
+; CHECK-SD-NEXT: mov x9, v5.d[1]
+; CHECK-SD-NEXT: fmov x15, d17
+; CHECK-SD-NEXT: mov v0.s[2], v3.s[0]
+; CHECK-SD-NEXT: scvtf s3, x10
+; CHECK-SD-NEXT: fmov x10, d24
+; CHECK-SD-NEXT: mov v1.s[2], v4.s[0]
+; CHECK-SD-NEXT: scvtf s24, x12
+; CHECK-SD-NEXT: scvtf s6, x11
+; CHECK-SD-NEXT: fmov x11, d5
+; CHECK-SD-NEXT: fmov x12, d7
+; CHECK-SD-NEXT: mov v2.s[1], v25.s[0]
+; CHECK-SD-NEXT: scvtf s4, x10
+; CHECK-SD-NEXT: fmov x10, d21
+; CHECK-SD-NEXT: scvtf s21, x8
+; CHECK-SD-NEXT: mov x8, v23.d[1]
+; CHECK-SD-NEXT: scvtf s25, x13
+; CHECK-SD-NEXT: mov x13, v19.d[1]
+; CHECK-SD-NEXT: scvtf s26, x11
+; CHECK-SD-NEXT: mov x11, v20.d[1]
+; CHECK-SD-NEXT: mov v3.s[1], v6.s[0]
+; CHECK-SD-NEXT: scvtf s5, x10
+; CHECK-SD-NEXT: mov x10, v7.d[1]
+; CHECK-SD-NEXT: scvtf s7, x14
+; CHECK-SD-NEXT: mov v4.s[1], v24.s[0]
+; CHECK-SD-NEXT: scvtf s24, x12
+; CHECK-SD-NEXT: fmov x12, d20
+; CHECK-SD-NEXT: scvtf s20, x8
+; CHECK-SD-NEXT: fmov x8, d23
+; CHECK-SD-NEXT: scvtf s19, x13
+; CHECK-SD-NEXT: fmov x13, d18
+; CHECK-SD-NEXT: fmov x14, d16
+; CHECK-SD-NEXT: mov v2.s[2], v26.s[0]
+; CHECK-SD-NEXT: mov v5.s[1], v25.s[0]
+; CHECK-SD-NEXT: scvtf s23, x10
+; CHECK-SD-NEXT: mov v0.s[3], v22.s[0]
+; CHECK-SD-NEXT: scvtf s6, x8
+; CHECK-SD-NEXT: mov x8, v18.d[1]
+; CHECK-SD-NEXT: scvtf s18, x12
+; CHECK-SD-NEXT: mov x12, v16.d[1]
+; CHECK-SD-NEXT: scvtf s16, x13
+; CHECK-SD-NEXT: mov x13, v17.d[1]
+; CHECK-SD-NEXT: scvtf s17, x14
+; CHECK-SD-NEXT: mov v7.s[1], v19.s[0]
+; CHECK-SD-NEXT: scvtf s19, x9
+; CHECK-SD-NEXT: mov v3.s[2], v24.s[0]
+; CHECK-SD-NEXT: scvtf s24, x11
+; CHECK-SD-NEXT: mov v1.s[3], v21.s[0]
+; CHECK-SD-NEXT: mov v6.s[1], v20.s[0]
+; CHECK-SD-NEXT: scvtf s20, x15
+; CHECK-SD-NEXT: mov v4.s[2], v18.s[0]
+; CHECK-SD-NEXT: scvtf s18, x8
+; CHECK-SD-NEXT: mov v5.s[2], v16.s[0]
+; CHECK-SD-NEXT: scvtf s16, x12
+; CHECK-SD-NEXT: mov v2.s[3], v19.s[0]
+; CHECK-SD-NEXT: mov v3.s[3], v23.s[0]
+; CHECK-SD-NEXT: mov v6.s[2], v17.s[0]
+; CHECK-SD-NEXT: mov v7.s[2], v20.s[0]
+; CHECK-SD-NEXT: scvtf s17, x13
+; CHECK-SD-NEXT: mov v4.s[3], v24.s[0]
+; CHECK-SD-NEXT: mov v5.s[3], v18.s[0]
+; CHECK-SD-NEXT: mov v6.s[3], v16.s[0]
+; CHECK-SD-NEXT: mov v7.s[3], v17.s[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: stofp_v32i64_v32f32:
@@ -4728,42 +5132,242 @@ entry:
define <32 x float> @utofp_v32i64_v32f32(<32 x i64> %a) {
; CHECK-SD-LABEL: utofp_v32i64_v32f32:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldp q17, q16, [sp, #64]
-; CHECK-SD-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-SD-NEXT: ldp q19, q18, [sp, #32]
-; CHECK-SD-NEXT: ucvtf v2.2d, v2.2d
-; CHECK-SD-NEXT: ldp q21, q20, [sp]
-; CHECK-SD-NEXT: ucvtf v4.2d, v4.2d
-; CHECK-SD-NEXT: ldp q23, q22, [sp, #96]
-; CHECK-SD-NEXT: ucvtf v6.2d, v6.2d
-; CHECK-SD-NEXT: ucvtf v19.2d, v19.2d
-; CHECK-SD-NEXT: ucvtf v17.2d, v17.2d
-; CHECK-SD-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-SD-NEXT: ucvtf v21.2d, v21.2d
-; CHECK-SD-NEXT: ucvtf v24.2d, v1.2d
-; CHECK-SD-NEXT: fcvtn v1.2s, v2.2d
-; CHECK-SD-NEXT: ucvtf v23.2d, v23.2d
-; CHECK-SD-NEXT: ucvtf v25.2d, v3.2d
-; CHECK-SD-NEXT: fcvtn v2.2s, v4.2d
-; CHECK-SD-NEXT: ucvtf v26.2d, v5.2d
-; CHECK-SD-NEXT: fcvtn v3.2s, v6.2d
-; CHECK-SD-NEXT: ucvtf v27.2d, v7.2d
-; CHECK-SD-NEXT: ucvtf v20.2d, v20.2d
-; CHECK-SD-NEXT: fcvtn v5.2s, v19.2d
-; CHECK-SD-NEXT: ucvtf v18.2d, v18.2d
-; CHECK-SD-NEXT: fcvtn v4.2s, v21.2d
-; CHECK-SD-NEXT: fcvtn v6.2s, v17.2d
-; CHECK-SD-NEXT: ucvtf v16.2d, v16.2d
-; CHECK-SD-NEXT: fcvtn v7.2s, v23.2d
-; CHECK-SD-NEXT: ucvtf v17.2d, v22.2d
-; CHECK-SD-NEXT: fcvtn2 v0.4s, v24.2d
-; CHECK-SD-NEXT: fcvtn2 v1.4s, v25.2d
-; CHECK-SD-NEXT: fcvtn2 v2.4s, v26.2d
-; CHECK-SD-NEXT: fcvtn2 v3.4s, v27.2d
-; CHECK-SD-NEXT: fcvtn2 v5.4s, v18.2d
-; CHECK-SD-NEXT: fcvtn2 v4.4s, v20.2d
-; CHECK-SD-NEXT: fcvtn2 v6.4s, v16.2d
-; CHECK-SD-NEXT: fcvtn2 v7.4s, v17.2d
+; CHECK-SD-NEXT: stp d9, d8, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: .cfi_offset b8, -8
+; CHECK-SD-NEXT: .cfi_offset b9, -16
+; CHECK-SD-NEXT: movi v16.2d, #0x000000ffffffff
+; CHECK-SD-NEXT: ushr v17.2d, v0.2d, #32
+; CHECK-SD-NEXT: ushr v19.2d, v2.2d, #32
+; CHECK-SD-NEXT: ushr v18.2d, v1.2d, #32
+; CHECK-SD-NEXT: ushr v21.2d, v3.2d, #32
+; CHECK-SD-NEXT: ushr v28.2d, v4.2d, #32
+; CHECK-SD-NEXT: ushr v31.2d, v6.2d, #32
+; CHECK-SD-NEXT: mov x8, v17.d[1]
+; CHECK-SD-NEXT: fmov x13, d17
+; CHECK-SD-NEXT: mov x10, v19.d[1]
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v16.16b
+; CHECK-SD-NEXT: and v20.16b, v1.16b, v16.16b
+; CHECK-SD-NEXT: and v1.16b, v2.16b, v16.16b
+; CHECK-SD-NEXT: mov x9, v18.d[1]
+; CHECK-SD-NEXT: fmov x14, d18
+; CHECK-SD-NEXT: and v23.16b, v3.16b, v16.16b
+; CHECK-SD-NEXT: scvtf s2, x13
+; CHECK-SD-NEXT: and v24.16b, v4.16b, v16.16b
+; CHECK-SD-NEXT: and v29.16b, v5.16b, v16.16b
+; CHECK-SD-NEXT: mov x11, v0.d[1]
+; CHECK-SD-NEXT: fmov x13, d0
+; CHECK-SD-NEXT: scvtf s17, x8
+; CHECK-SD-NEXT: fmov x8, d19
+; CHECK-SD-NEXT: scvtf s25, x10
+; CHECK-SD-NEXT: fmov x10, d20
+; CHECK-SD-NEXT: mov x12, v1.d[1]
+; CHECK-SD-NEXT: scvtf s22, x9
+; CHECK-SD-NEXT: fmov x9, d23
+; CHECK-SD-NEXT: scvtf s0, x13
+; CHECK-SD-NEXT: scvtf s26, x14
+; CHECK-SD-NEXT: and v30.16b, v6.16b, v16.16b
+; CHECK-SD-NEXT: scvtf s18, x11
+; CHECK-SD-NEXT: fmov x11, d1
+; CHECK-SD-NEXT: scvtf s3, x8
+; CHECK-SD-NEXT: scvtf s27, x10
+; CHECK-SD-NEXT: mov x10, v24.d[1]
+; CHECK-SD-NEXT: mov x8, v23.d[1]
+; CHECK-SD-NEXT: mov v2.s[1], v17.s[0]
+; CHECK-SD-NEXT: scvtf s23, x9
+; CHECK-SD-NEXT: mov x9, v21.d[1]
+; CHECK-SD-NEXT: scvtf s1, x11
+; CHECK-SD-NEXT: fmov x11, d21
+; CHECK-SD-NEXT: scvtf s19, x12
+; CHECK-SD-NEXT: mov v0.s[1], v18.s[0]
+; CHECK-SD-NEXT: ldp q18, q17, [sp, #112]
+; CHECK-SD-NEXT: mov v3.s[1], v25.s[0]
+; CHECK-SD-NEXT: mov x14, v20.d[1]
+; CHECK-SD-NEXT: fmov x13, d28
+; CHECK-SD-NEXT: scvtf s25, x11
+; CHECK-SD-NEXT: fmov x11, d24
+; CHECK-SD-NEXT: mov v2.s[2], v26.s[0]
+; CHECK-SD-NEXT: scvtf s26, x8
+; CHECK-SD-NEXT: mov x8, v29.d[1]
+; CHECK-SD-NEXT: fmov x12, d29
+; CHECK-SD-NEXT: mov v0.s[2], v27.s[0]
+; CHECK-SD-NEXT: scvtf s27, x10
+; CHECK-SD-NEXT: mov x10, v28.d[1]
+; CHECK-SD-NEXT: scvtf s4, x11
+; CHECK-SD-NEXT: ushr v29.2d, v5.2d, #32
+; CHECK-SD-NEXT: mov v1.s[1], v19.s[0]
+; CHECK-SD-NEXT: ldp q21, q19, [sp, #80]
+; CHECK-SD-NEXT: scvtf s6, x13
+; CHECK-SD-NEXT: scvtf s20, x14
+; CHECK-SD-NEXT: mov x14, v31.d[1]
+; CHECK-SD-NEXT: scvtf s9, x8
+; CHECK-SD-NEXT: scvtf s28, x10
+; CHECK-SD-NEXT: fmov x8, d29
+; CHECK-SD-NEXT: mov x11, v30.d[1]
+; CHECK-SD-NEXT: mov v4.s[1], v27.s[0]
+; CHECK-SD-NEXT: scvtf s27, x9
+; CHECK-SD-NEXT: fmov x9, d30
+; CHECK-SD-NEXT: mov v1.s[2], v23.s[0]
+; CHECK-SD-NEXT: ldp q24, q23, [sp, #16]
+; CHECK-SD-NEXT: mov v0.s[3], v20.s[0]
+; CHECK-SD-NEXT: scvtf s8, x14
+; CHECK-SD-NEXT: mov v3.s[2], v25.s[0]
+; CHECK-SD-NEXT: scvtf s5, x9
+; CHECK-SD-NEXT: fmov x9, d31
+; CHECK-SD-NEXT: and v31.16b, v7.16b, v16.16b
+; CHECK-SD-NEXT: mov v6.s[1], v28.s[0]
+; CHECK-SD-NEXT: ushr v7.2d, v7.2d, #32
+; CHECK-SD-NEXT: scvtf s28, x8
+; CHECK-SD-NEXT: scvtf s30, x11
+; CHECK-SD-NEXT: scvtf s25, x12
+; CHECK-SD-NEXT: mov v1.s[3], v26.s[0]
+; CHECK-SD-NEXT: fmov x8, d31
+; CHECK-SD-NEXT: scvtf s20, x9
+; CHECK-SD-NEXT: mov x9, v29.d[1]
+; CHECK-SD-NEXT: fmov x11, d7
+; CHECK-SD-NEXT: mov x10, v31.d[1]
+; CHECK-SD-NEXT: mov v3.s[3], v27.s[0]
+; CHECK-SD-NEXT: mov v6.s[2], v28.s[0]
+; CHECK-SD-NEXT: mov v2.s[3], v22.s[0]
+; CHECK-SD-NEXT: scvtf s29, x8
+; CHECK-SD-NEXT: mov x8, v7.d[1]
+; CHECK-SD-NEXT: ushr v7.2d, v24.2d, #32
+; CHECK-SD-NEXT: and v24.16b, v24.16b, v16.16b
+; CHECK-SD-NEXT: mov v5.s[1], v30.s[0]
+; CHECK-SD-NEXT: scvtf s30, x11
+; CHECK-SD-NEXT: mov v20.s[1], v8.s[0]
+; CHECK-SD-NEXT: scvtf s26, x9
+; CHECK-SD-NEXT: mov v4.s[2], v25.s[0]
+; CHECK-SD-NEXT: ldp q31, q25, [sp, #48]
+; CHECK-SD-NEXT: mov x11, v7.d[1]
+; CHECK-SD-NEXT: mov x9, v24.d[1]
+; CHECK-SD-NEXT: fmov x12, d7
+; CHECK-SD-NEXT: scvtf s27, x10
+; CHECK-SD-NEXT: fmov x10, d24
+; CHECK-SD-NEXT: ushr v24.2d, v23.2d, #32
+; CHECK-SD-NEXT: mov v5.s[2], v29.s[0]
+; CHECK-SD-NEXT: mov v20.s[2], v30.s[0]
+; CHECK-SD-NEXT: ushr v29.2d, v31.2d, #32
+; CHECK-SD-NEXT: scvtf s30, x8
+; CHECK-SD-NEXT: scvtf s28, x11
+; CHECK-SD-NEXT: scvtf s7, x12
+; CHECK-SD-NEXT: and v31.16b, v31.16b, v16.16b
+; CHECK-SD-NEXT: scvtf s8, x9
+; CHECK-SD-NEXT: scvtf s22, x10
+; CHECK-SD-NEXT: fmov x8, d24
+; CHECK-SD-NEXT: mov x9, v24.d[1]
+; CHECK-SD-NEXT: mov x10, v29.d[1]
+; CHECK-SD-NEXT: mov v5.s[3], v27.s[0]
+; CHECK-SD-NEXT: mov x11, v31.d[1]
+; CHECK-SD-NEXT: fmov x12, d29
+; CHECK-SD-NEXT: and v29.16b, v21.16b, v16.16b
+; CHECK-SD-NEXT: mov v7.s[1], v28.s[0]
+; CHECK-SD-NEXT: scvtf s27, x8
+; CHECK-SD-NEXT: and v23.16b, v23.16b, v16.16b
+; CHECK-SD-NEXT: mov v22.s[1], v8.s[0]
+; CHECK-SD-NEXT: ushr v8.2d, v25.2d, #32
+; CHECK-SD-NEXT: mov v20.s[3], v30.s[0]
+; CHECK-SD-NEXT: scvtf s30, x10
+; CHECK-SD-NEXT: scvtf s24, x12
+; CHECK-SD-NEXT: mov x10, v29.d[1]
+; CHECK-SD-NEXT: scvtf s28, x11
+; CHECK-SD-NEXT: fmov x12, d29
+; CHECK-SD-NEXT: ushr v29.2d, v18.2d, #32
+; CHECK-SD-NEXT: mov v7.s[2], v27.s[0]
+; CHECK-SD-NEXT: scvtf s27, x9
+; CHECK-SD-NEXT: fmov x9, d31
+; CHECK-SD-NEXT: fmov x11, d8
+; CHECK-SD-NEXT: mov x8, v23.d[1]
+; CHECK-SD-NEXT: fmov x13, d23
+; CHECK-SD-NEXT: and v25.16b, v25.16b, v16.16b
+; CHECK-SD-NEXT: mov v24.s[1], v30.s[0]
+; CHECK-SD-NEXT: ushr v30.2d, v21.2d, #32
+; CHECK-SD-NEXT: scvtf s23, x9
+; CHECK-SD-NEXT: mov v6.s[3], v26.s[0]
+; CHECK-SD-NEXT: scvtf s21, x12
+; CHECK-SD-NEXT: scvtf s31, x11
+; CHECK-SD-NEXT: mov x11, v29.d[1]
+; CHECK-SD-NEXT: scvtf s26, x13
+; CHECK-SD-NEXT: mov x12, v25.d[1]
+; CHECK-SD-NEXT: fmov x13, d25
+; CHECK-SD-NEXT: ushr v25.2d, v19.2d, #32
+; CHECK-SD-NEXT: mov x9, v8.d[1]
+; CHECK-SD-NEXT: scvtf s8, x10
+; CHECK-SD-NEXT: mov x10, v30.d[1]
+; CHECK-SD-NEXT: mov v23.s[1], v28.s[0]
+; CHECK-SD-NEXT: fmov x14, d30
+; CHECK-SD-NEXT: and v18.16b, v18.16b, v16.16b
+; CHECK-SD-NEXT: scvtf s28, x11
+; CHECK-SD-NEXT: fmov x11, d29
+; CHECK-SD-NEXT: mov v22.s[2], v26.s[0]
+; CHECK-SD-NEXT: mov v7.s[3], v27.s[0]
+; CHECK-SD-NEXT: and v19.16b, v19.16b, v16.16b
+; CHECK-SD-NEXT: and v16.16b, v17.16b, v16.16b
+; CHECK-SD-NEXT: scvtf s26, x10
+; CHECK-SD-NEXT: scvtf s27, x14
+; CHECK-SD-NEXT: mov x14, v25.d[1]
+; CHECK-SD-NEXT: scvtf s29, x11
+; CHECK-SD-NEXT: fmov x11, d25
+; CHECK-SD-NEXT: ushr v25.2d, v17.2d, #32
+; CHECK-SD-NEXT: mov x10, v18.d[1]
+; CHECK-SD-NEXT: scvtf s17, x13
+; CHECK-SD-NEXT: fmov x13, d19
+; CHECK-SD-NEXT: mov v24.s[2], v31.s[0]
+; CHECK-SD-NEXT: mov v21.s[1], v8.s[0]
+; CHECK-SD-NEXT: mov v4.s[3], v9.s[0]
+; CHECK-SD-NEXT: scvtf s30, x11
+; CHECK-SD-NEXT: fmov x11, d18
+; CHECK-SD-NEXT: mov v27.s[1], v26.s[0]
+; CHECK-SD-NEXT: mov v29.s[1], v28.s[0]
+; CHECK-SD-NEXT: scvtf s28, x13
+; CHECK-SD-NEXT: fmov x13, d16
+; CHECK-SD-NEXT: scvtf s18, x10
+; CHECK-SD-NEXT: mov x10, v25.d[1]
+; CHECK-SD-NEXT: mov v23.s[2], v17.s[0]
+; CHECK-SD-NEXT: scvtf s26, x11
+; CHECK-SD-NEXT: fmov x11, d25
+; CHECK-SD-NEXT: scvtf s17, x8
+; CHECK-SD-NEXT: mov w8, #1333788672 // =0x4f800000
+; CHECK-SD-NEXT: mov v27.s[2], v30.s[0]
+; CHECK-SD-NEXT: mov v21.s[2], v28.s[0]
+; CHECK-SD-NEXT: scvtf s28, x12
+; CHECK-SD-NEXT: scvtf s25, x11
+; CHECK-SD-NEXT: mov x11, v19.d[1]
+; CHECK-SD-NEXT: scvtf s19, x9
+; CHECK-SD-NEXT: mov v26.s[1], v18.s[0]
+; CHECK-SD-NEXT: scvtf s18, x14
+; CHECK-SD-NEXT: mov x9, v16.d[1]
+; CHECK-SD-NEXT: scvtf s16, x13
+; CHECK-SD-NEXT: mov v22.s[3], v17.s[0]
+; CHECK-SD-NEXT: mov v23.s[3], v28.s[0]
+; CHECK-SD-NEXT: mov v29.s[2], v25.s[0]
+; CHECK-SD-NEXT: scvtf s25, x10
+; CHECK-SD-NEXT: mov v24.s[3], v19.s[0]
+; CHECK-SD-NEXT: mov v27.s[3], v18.s[0]
+; CHECK-SD-NEXT: scvtf s18, x11
+; CHECK-SD-NEXT: dup v19.4s, w8
+; CHECK-SD-NEXT: mov v26.s[2], v16.s[0]
+; CHECK-SD-NEXT: scvtf s16, x9
+; CHECK-SD-NEXT: mov v29.s[3], v25.s[0]
+; CHECK-SD-NEXT: fmul v2.4s, v2.4s, v19.4s
+; CHECK-SD-NEXT: fmul v3.4s, v3.4s, v19.4s
+; CHECK-SD-NEXT: fmul v6.4s, v6.4s, v19.4s
+; CHECK-SD-NEXT: fmul v17.4s, v20.4s, v19.4s
+; CHECK-SD-NEXT: fmul v7.4s, v7.4s, v19.4s
+; CHECK-SD-NEXT: fmul v20.4s, v24.4s, v19.4s
+; CHECK-SD-NEXT: mov v21.s[3], v18.s[0]
+; CHECK-SD-NEXT: fmul v18.4s, v27.4s, v19.4s
+; CHECK-SD-NEXT: mov v26.s[3], v16.s[0]
+; CHECK-SD-NEXT: fmul v16.4s, v29.4s, v19.4s
+; CHECK-SD-NEXT: fadd v0.4s, v2.4s, v0.4s
+; CHECK-SD-NEXT: fadd v1.4s, v3.4s, v1.4s
+; CHECK-SD-NEXT: fadd v2.4s, v6.4s, v4.4s
+; CHECK-SD-NEXT: fadd v3.4s, v17.4s, v5.4s
+; CHECK-SD-NEXT: fadd v4.4s, v7.4s, v22.4s
+; CHECK-SD-NEXT: fadd v5.4s, v20.4s, v23.4s
+; CHECK-SD-NEXT: fadd v6.4s, v18.4s, v21.4s
+; CHECK-SD-NEXT: fadd v7.4s, v16.4s, v26.4s
+; CHECK-SD-NEXT: ldp d9, d8, [sp], #16 // 16-byte Folded Reload
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: utofp_v32i64_v32f32:
@@ -6268,14 +6872,19 @@ define <3 x half> @stofp_v3i64_v3f16(<3 x i64> %a) {
; CHECK-SD-LABEL: stofp_v3i64_v3f16:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: fmov x8, d0
+; CHECK-SD-NEXT: movi v3.2d, #0000000000000000
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-SD-NEXT: scvtf v1.2d, v2.2d
-; CHECK-SD-NEXT: scvtf v0.2d, v0.2d
-; CHECK-SD-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-SD-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-SD-NEXT: scvtf s0, x8
+; CHECK-SD-NEXT: fmov x8, d1
+; CHECK-SD-NEXT: scvtf s1, x8
+; CHECK-SD-NEXT: fmov x8, d2
+; CHECK-SD-NEXT: mov v3.s[0], v0.s[0]
+; CHECK-SD-NEXT: scvtf s0, x8
+; CHECK-SD-NEXT: mov v3.s[1], v1.s[0]
+; CHECK-SD-NEXT: mov v3.s[2], v0.s[0]
+; CHECK-SD-NEXT: fcvtn v0.4h, v3.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-NOFP16-LABEL: stofp_v3i64_v3f16:
@@ -6318,11 +6927,38 @@ define <3 x half> @utofp_v3i64_v3f16(<3 x i64> %a) {
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT: ushr v5.2d, v2.2d, #32
; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-SD-NEXT: ucvtf v1.2d, v2.2d
-; CHECK-SD-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-SD-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-SD-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-SD-NEXT: movi v1.2d, #0x000000ffffffff
+; CHECK-SD-NEXT: ushr v3.2d, v0.2d, #32
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-SD-NEXT: mov x8, v3.d[1]
+; CHECK-SD-NEXT: fmov x10, d3
+; CHECK-SD-NEXT: mov x9, v0.d[1]
+; CHECK-SD-NEXT: scvtf s4, x10
+; CHECK-SD-NEXT: scvtf s3, x8
+; CHECK-SD-NEXT: fmov x8, d0
+; CHECK-SD-NEXT: scvtf s0, x9
+; CHECK-SD-NEXT: mov x9, v5.d[1]
+; CHECK-SD-NEXT: scvtf s2, x8
+; CHECK-SD-NEXT: fmov x8, d5
+; CHECK-SD-NEXT: mov v4.s[1], v3.s[0]
+; CHECK-SD-NEXT: scvtf s3, x8
+; CHECK-SD-NEXT: fmov x8, d1
+; CHECK-SD-NEXT: mov v2.s[1], v0.s[0]
+; CHECK-SD-NEXT: scvtf s0, x8
+; CHECK-SD-NEXT: mov x8, v1.d[1]
+; CHECK-SD-NEXT: scvtf s1, x9
+; CHECK-SD-NEXT: mov v4.s[2], v3.s[0]
+; CHECK-SD-NEXT: mov v2.s[2], v0.s[0]
+; CHECK-SD-NEXT: scvtf s0, x8
+; CHECK-SD-NEXT: mov w8, #1333788672 // =0x4f800000
+; CHECK-SD-NEXT: mov v4.s[3], v1.s[0]
+; CHECK-SD-NEXT: dup v1.4s, w8
+; CHECK-SD-NEXT: mov v2.s[3], v0.s[0]
+; CHECK-SD-NEXT: fmul v0.4s, v4.4s, v1.4s
+; CHECK-SD-NEXT: fadd v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s
; CHECK-SD-NEXT: ret
;
@@ -6363,11 +6999,18 @@ entry:
define <4 x half> @stofp_v4i64_v4f16(<4 x i64> %a) {
; CHECK-SD-LABEL: stofp_v4i64_v4f16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: scvtf v0.2d, v0.2d
-; CHECK-SD-NEXT: scvtf v1.2d, v1.2d
-; CHECK-SD-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-SD-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-SD-NEXT: mov x8, v0.d[1]
+; CHECK-SD-NEXT: fmov x9, d0
+; CHECK-SD-NEXT: scvtf s2, x9
+; CHECK-SD-NEXT: mov x9, v1.d[1]
+; CHECK-SD-NEXT: scvtf s0, x8
+; CHECK-SD-NEXT: fmov x8, d1
+; CHECK-SD-NEXT: mov v2.s[1], v0.s[0]
+; CHECK-SD-NEXT: scvtf s0, x8
+; CHECK-SD-NEXT: mov v2.s[2], v0.s[0]
+; CHECK-SD-NEXT: scvtf s0, x9
+; CHECK-SD-NEXT: mov v2.s[3], v0.s[0]
+; CHECK-SD-NEXT: fcvtn v0.4h, v2.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-NOFP16-LABEL: stofp_v4i64_v4f16:
@@ -6402,10 +7045,37 @@ entry:
define <4 x half> @utofp_v4i64_v4f16(<4 x i64> %a) {
; CHECK-SD-LABEL: utofp_v4i64_v4f16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-SD-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-SD-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-SD-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-SD-NEXT: movi v2.2d, #0x000000ffffffff
+; CHECK-SD-NEXT: ushr v3.2d, v0.2d, #32
+; CHECK-SD-NEXT: ushr v4.2d, v1.2d, #32
+; CHECK-SD-NEXT: mov x8, v3.d[1]
+; CHECK-SD-NEXT: fmov x10, d3
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT: scvtf s3, x10
+; CHECK-SD-NEXT: scvtf s5, x8
+; CHECK-SD-NEXT: fmov x8, d0
+; CHECK-SD-NEXT: mov x9, v0.d[1]
+; CHECK-SD-NEXT: scvtf s2, x8
+; CHECK-SD-NEXT: fmov x8, d4
+; CHECK-SD-NEXT: scvtf s0, x9
+; CHECK-SD-NEXT: mov x9, v4.d[1]
+; CHECK-SD-NEXT: mov v3.s[1], v5.s[0]
+; CHECK-SD-NEXT: scvtf s4, x8
+; CHECK-SD-NEXT: fmov x8, d1
+; CHECK-SD-NEXT: mov v2.s[1], v0.s[0]
+; CHECK-SD-NEXT: scvtf s0, x8
+; CHECK-SD-NEXT: mov x8, v1.d[1]
+; CHECK-SD-NEXT: scvtf s1, x9
+; CHECK-SD-NEXT: mov v3.s[2], v4.s[0]
+; CHECK-SD-NEXT: mov v2.s[2], v0.s[0]
+; CHECK-SD-NEXT: scvtf s0, x8
+; CHECK-SD-NEXT: mov w8, #1333788672 // =0x4f800000
+; CHECK-SD-NEXT: mov v3.s[3], v1.s[0]
+; CHECK-SD-NEXT: dup v1.4s, w8
+; CHECK-SD-NEXT: mov v2.s[3], v0.s[0]
+; CHECK-SD-NEXT: fmul v0.4s, v3.4s, v1.4s
+; CHECK-SD-NEXT: fadd v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s
; CHECK-SD-NEXT: ret
;
@@ -6441,16 +7111,30 @@ entry:
define <8 x half> @stofp_v8i64_v8f16(<8 x i64> %a) {
; CHECK-SD-LABEL: stofp_v8i64_v8f16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: scvtf v0.2d, v0.2d
-; CHECK-SD-NEXT: scvtf v2.2d, v2.2d
-; CHECK-SD-NEXT: scvtf v1.2d, v1.2d
-; CHECK-SD-NEXT: scvtf v3.2d, v3.2d
-; CHECK-SD-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-SD-NEXT: fcvtn v2.2s, v2.2d
-; CHECK-SD-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-SD-NEXT: fcvtn2 v2.4s, v3.2d
-; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-SD-NEXT: fcvtn2 v0.8h, v2.4s
+; CHECK-SD-NEXT: mov x9, v0.d[1]
+; CHECK-SD-NEXT: fmov x10, d0
+; CHECK-SD-NEXT: mov x8, v2.d[1]
+; CHECK-SD-NEXT: scvtf s4, x10
+; CHECK-SD-NEXT: fmov x10, d1
+; CHECK-SD-NEXT: scvtf s0, x9
+; CHECK-SD-NEXT: fmov x9, d2
+; CHECK-SD-NEXT: scvtf s2, x8
+; CHECK-SD-NEXT: mov x8, v1.d[1]
+; CHECK-SD-NEXT: scvtf s1, x9
+; CHECK-SD-NEXT: fmov x9, d3
+; CHECK-SD-NEXT: mov v4.s[1], v0.s[0]
+; CHECK-SD-NEXT: scvtf s0, x10
+; CHECK-SD-NEXT: mov x10, v3.d[1]
+; CHECK-SD-NEXT: scvtf s3, x9
+; CHECK-SD-NEXT: mov v1.s[1], v2.s[0]
+; CHECK-SD-NEXT: scvtf s2, x8
+; CHECK-SD-NEXT: mov v4.s[2], v0.s[0]
+; CHECK-SD-NEXT: scvtf s0, x10
+; CHECK-SD-NEXT: mov v1.s[2], v3.s[0]
+; CHECK-SD-NEXT: mov v4.s[3], v2.s[0]
+; CHECK-SD-NEXT: mov v1.s[3], v0.s[0]
+; CHECK-SD-NEXT: fcvtn v0.4h, v4.4s
+; CHECK-SD-NEXT: fcvtn2 v0.8h, v1.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-NOFP16-LABEL: stofp_v8i64_v8f16:
@@ -6501,16 +7185,67 @@ entry:
define <8 x half> @utofp_v8i64_v8f16(<8 x i64> %a) {
; CHECK-SD-LABEL: utofp_v8i64_v8f16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-SD-NEXT: ucvtf v2.2d, v2.2d
-; CHECK-SD-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-SD-NEXT: ucvtf v3.2d, v3.2d
-; CHECK-SD-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-SD-NEXT: fcvtn v2.2s, v2.2d
-; CHECK-SD-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-SD-NEXT: fcvtn2 v2.4s, v3.2d
+; CHECK-SD-NEXT: movi v4.2d, #0x000000ffffffff
+; CHECK-SD-NEXT: ushr v6.2d, v2.2d, #32
+; CHECK-SD-NEXT: ushr v5.2d, v0.2d, #32
+; CHECK-SD-NEXT: ushr v7.2d, v1.2d, #32
+; CHECK-SD-NEXT: mov x9, v6.d[1]
+; CHECK-SD-NEXT: fmov x10, d6
+; CHECK-SD-NEXT: mov x8, v5.d[1]
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v4.16b
+; CHECK-SD-NEXT: and v2.16b, v2.16b, v4.16b
+; CHECK-SD-NEXT: and v1.16b, v1.16b, v4.16b
+; CHECK-SD-NEXT: scvtf s6, x10
+; CHECK-SD-NEXT: fmov x10, d5
+; CHECK-SD-NEXT: mov x11, v0.d[1]
+; CHECK-SD-NEXT: scvtf s17, x9
+; CHECK-SD-NEXT: fmov x9, d0
+; CHECK-SD-NEXT: scvtf s16, x8
+; CHECK-SD-NEXT: mov x8, v2.d[1]
+; CHECK-SD-NEXT: scvtf s5, x10
+; CHECK-SD-NEXT: mov x10, v7.d[1]
+; CHECK-SD-NEXT: scvtf s0, x9
+; CHECK-SD-NEXT: fmov x9, d2
+; CHECK-SD-NEXT: scvtf s2, x11
+; CHECK-SD-NEXT: fmov x11, d7
+; CHECK-SD-NEXT: ushr v7.2d, v3.2d, #32
+; CHECK-SD-NEXT: and v3.16b, v3.16b, v4.16b
+; CHECK-SD-NEXT: mov v6.s[1], v17.s[0]
+; CHECK-SD-NEXT: mov v5.s[1], v16.s[0]
+; CHECK-SD-NEXT: scvtf s17, x9
+; CHECK-SD-NEXT: scvtf s16, x8
+; CHECK-SD-NEXT: scvtf s4, x11
+; CHECK-SD-NEXT: fmov x9, d7
+; CHECK-SD-NEXT: fmov x11, d1
+; CHECK-SD-NEXT: mov x8, v7.d[1]
+; CHECK-SD-NEXT: mov v0.s[1], v2.s[0]
+; CHECK-SD-NEXT: scvtf s7, x9
+; CHECK-SD-NEXT: scvtf s2, x11
+; CHECK-SD-NEXT: mov x9, v1.d[1]
+; CHECK-SD-NEXT: mov v5.s[2], v4.s[0]
+; CHECK-SD-NEXT: scvtf s1, x10
+; CHECK-SD-NEXT: fmov x10, d3
+; CHECK-SD-NEXT: mov x11, v3.d[1]
+; CHECK-SD-NEXT: mov v17.s[1], v16.s[0]
+; CHECK-SD-NEXT: scvtf s4, x8
+; CHECK-SD-NEXT: mov w8, #1333788672 // =0x4f800000
+; CHECK-SD-NEXT: scvtf s3, x10
+; CHECK-SD-NEXT: mov v6.s[2], v7.s[0]
+; CHECK-SD-NEXT: mov v0.s[2], v2.s[0]
+; CHECK-SD-NEXT: scvtf s2, x9
+; CHECK-SD-NEXT: mov v5.s[3], v1.s[0]
+; CHECK-SD-NEXT: dup v1.4s, w8
+; CHECK-SD-NEXT: mov v17.s[2], v3.s[0]
+; CHECK-SD-NEXT: scvtf s3, x11
+; CHECK-SD-NEXT: mov v6.s[3], v4.s[0]
+; CHECK-SD-NEXT: mov v0.s[3], v2.s[0]
+; CHECK-SD-NEXT: fmul v2.4s, v5.4s, v1.4s
+; CHECK-SD-NEXT: mov v17.s[3], v3.s[0]
+; CHECK-SD-NEXT: fmul v1.4s, v6.4s, v1.4s
+; CHECK-SD-NEXT: fadd v0.4s, v2.4s, v0.4s
+; CHECK-SD-NEXT: fadd v1.4s, v1.4s, v17.4s
; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-SD-NEXT: fcvtn2 v0.8h, v2.4s
+; CHECK-SD-NEXT: fcvtn2 v0.8h, v1.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-NOFP16-LABEL: utofp_v8i64_v8f16:
@@ -6561,26 +7296,54 @@ entry:
define <16 x half> @stofp_v16i64_v16f16(<16 x i64> %a) {
; CHECK-SD-LABEL: stofp_v16i64_v16f16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: scvtf v0.2d, v0.2d
-; CHECK-SD-NEXT: scvtf v4.2d, v4.2d
-; CHECK-SD-NEXT: scvtf v2.2d, v2.2d
-; CHECK-SD-NEXT: scvtf v1.2d, v1.2d
-; CHECK-SD-NEXT: scvtf v6.2d, v6.2d
-; CHECK-SD-NEXT: scvtf v5.2d, v5.2d
-; CHECK-SD-NEXT: scvtf v3.2d, v3.2d
-; CHECK-SD-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-SD-NEXT: fcvtn v4.2s, v4.2d
-; CHECK-SD-NEXT: fcvtn v2.2s, v2.2d
-; CHECK-SD-NEXT: fcvtn v6.2s, v6.2d
-; CHECK-SD-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-SD-NEXT: scvtf v1.2d, v7.2d
-; CHECK-SD-NEXT: fcvtn2 v4.4s, v5.2d
-; CHECK-SD-NEXT: fcvtn2 v2.4s, v3.2d
+; CHECK-SD-NEXT: fmov x12, d0
+; CHECK-SD-NEXT: mov x8, v0.d[1]
+; CHECK-SD-NEXT: mov x9, v4.d[1]
+; CHECK-SD-NEXT: mov x10, v2.d[1]
+; CHECK-SD-NEXT: fmov x11, d2
+; CHECK-SD-NEXT: mov x13, v1.d[1]
+; CHECK-SD-NEXT: scvtf s0, x12
+; CHECK-SD-NEXT: fmov x12, d4
+; CHECK-SD-NEXT: scvtf s2, x8
+; CHECK-SD-NEXT: mov x8, v6.d[1]
+; CHECK-SD-NEXT: scvtf s16, x9
+; CHECK-SD-NEXT: fmov x9, d1
+; CHECK-SD-NEXT: scvtf s1, x10
+; CHECK-SD-NEXT: fmov x10, d5
+; CHECK-SD-NEXT: scvtf s4, x12
+; CHECK-SD-NEXT: fmov x12, d6
+; CHECK-SD-NEXT: scvtf s6, x11
+; CHECK-SD-NEXT: mov x11, v5.d[1]
+; CHECK-SD-NEXT: scvtf s5, x9
+; CHECK-SD-NEXT: mov v0.s[1], v2.s[0]
+; CHECK-SD-NEXT: scvtf s2, x10
+; CHECK-SD-NEXT: scvtf s17, x12
+; CHECK-SD-NEXT: fmov x9, d3
+; CHECK-SD-NEXT: fmov x12, d7
+; CHECK-SD-NEXT: mov v4.s[1], v16.s[0]
+; CHECK-SD-NEXT: scvtf s16, x8
+; CHECK-SD-NEXT: mov x8, v3.d[1]
+; CHECK-SD-NEXT: mov x10, v7.d[1]
+; CHECK-SD-NEXT: mov v6.s[1], v1.s[0]
+; CHECK-SD-NEXT: scvtf s7, x11
+; CHECK-SD-NEXT: scvtf s3, x9
+; CHECK-SD-NEXT: scvtf s1, x12
+; CHECK-SD-NEXT: mov v0.s[2], v5.s[0]
+; CHECK-SD-NEXT: scvtf s5, x13
+; CHECK-SD-NEXT: mov v17.s[1], v16.s[0]
+; CHECK-SD-NEXT: mov v4.s[2], v2.s[0]
+; CHECK-SD-NEXT: scvtf s2, x8
+; CHECK-SD-NEXT: mov v6.s[2], v3.s[0]
+; CHECK-SD-NEXT: mov v0.s[3], v5.s[0]
+; CHECK-SD-NEXT: mov v17.s[2], v1.s[0]
+; CHECK-SD-NEXT: scvtf s1, x10
+; CHECK-SD-NEXT: mov v4.s[3], v7.s[0]
+; CHECK-SD-NEXT: mov v6.s[3], v2.s[0]
; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-SD-NEXT: fcvtn2 v6.4s, v1.2d
+; CHECK-SD-NEXT: mov v17.s[3], v1.s[0]
; CHECK-SD-NEXT: fcvtn v1.4h, v4.4s
-; CHECK-SD-NEXT: fcvtn2 v0.8h, v2.4s
-; CHECK-SD-NEXT: fcvtn2 v1.8h, v6.4s
+; CHECK-SD-NEXT: fcvtn2 v0.8h, v6.4s
+; CHECK-SD-NEXT: fcvtn2 v1.8h, v17.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-NOFP16-LABEL: stofp_v16i64_v16f16:
@@ -6664,26 +7427,125 @@ entry:
define <16 x half> @utofp_v16i64_v16f16(<16 x i64> %a) {
; CHECK-SD-LABEL: utofp_v16i64_v16f16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-SD-NEXT: ucvtf v4.2d, v4.2d
-; CHECK-SD-NEXT: ucvtf v2.2d, v2.2d
-; CHECK-SD-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-SD-NEXT: ucvtf v6.2d, v6.2d
-; CHECK-SD-NEXT: ucvtf v5.2d, v5.2d
-; CHECK-SD-NEXT: ucvtf v3.2d, v3.2d
-; CHECK-SD-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-SD-NEXT: fcvtn v4.2s, v4.2d
-; CHECK-SD-NEXT: fcvtn v2.2s, v2.2d
-; CHECK-SD-NEXT: fcvtn v6.2s, v6.2d
-; CHECK-SD-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-SD-NEXT: ucvtf v1.2d, v7.2d
-; CHECK-SD-NEXT: fcvtn2 v4.4s, v5.2d
-; CHECK-SD-NEXT: fcvtn2 v2.4s, v3.2d
-; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-SD-NEXT: fcvtn2 v6.4s, v1.2d
+; CHECK-SD-NEXT: movi v16.2d, #0x000000ffffffff
+; CHECK-SD-NEXT: ushr v17.2d, v2.2d, #32
+; CHECK-SD-NEXT: ushr v18.2d, v3.2d, #32
+; CHECK-SD-NEXT: ushr v20.2d, v0.2d, #32
+; CHECK-SD-NEXT: ushr v21.2d, v1.2d, #32
+; CHECK-SD-NEXT: ushr v19.2d, v6.2d, #32
+; CHECK-SD-NEXT: mov x8, v17.d[1]
+; CHECK-SD-NEXT: fmov x10, d17
+; CHECK-SD-NEXT: mov x9, v18.d[1]
+; CHECK-SD-NEXT: and v22.16b, v0.16b, v16.16b
+; CHECK-SD-NEXT: and v23.16b, v2.16b, v16.16b
+; CHECK-SD-NEXT: fmov x12, d18
+; CHECK-SD-NEXT: mov x13, v20.d[1]
+; CHECK-SD-NEXT: and v24.16b, v3.16b, v16.16b
+; CHECK-SD-NEXT: and v17.16b, v1.16b, v16.16b
+; CHECK-SD-NEXT: scvtf s0, x10
+; CHECK-SD-NEXT: fmov x15, d20
+; CHECK-SD-NEXT: mov x11, v21.d[1]
+; CHECK-SD-NEXT: mov x14, v22.d[1]
+; CHECK-SD-NEXT: scvtf s18, x8
+; CHECK-SD-NEXT: fmov x8, d22
+; CHECK-SD-NEXT: mov x10, v23.d[1]
+; CHECK-SD-NEXT: fmov x16, d23
+; CHECK-SD-NEXT: ushr v23.2d, v4.2d, #32
+; CHECK-SD-NEXT: scvtf s3, x15
+; CHECK-SD-NEXT: fmov x15, d21
+; CHECK-SD-NEXT: scvtf s21, x13
+; CHECK-SD-NEXT: scvtf s2, x8
+; CHECK-SD-NEXT: fmov x13, d24
+; CHECK-SD-NEXT: and v6.16b, v6.16b, v16.16b
+; CHECK-SD-NEXT: scvtf s20, x14
+; CHECK-SD-NEXT: mov v0.s[1], v18.s[0]
+; CHECK-SD-NEXT: scvtf s18, x12
+; CHECK-SD-NEXT: fmov x12, d17
+; CHECK-SD-NEXT: mov x14, v19.d[1]
+; CHECK-SD-NEXT: scvtf s1, x16
+; CHECK-SD-NEXT: scvtf s22, x10
+; CHECK-SD-NEXT: mov x10, v17.d[1]
+; CHECK-SD-NEXT: scvtf s17, x13
+; CHECK-SD-NEXT: mov x13, v23.d[1]
+; CHECK-SD-NEXT: mov v3.s[1], v21.s[0]
+; CHECK-SD-NEXT: scvtf s21, x15
+; CHECK-SD-NEXT: mov v2.s[1], v20.s[0]
+; CHECK-SD-NEXT: scvtf s20, x12
+; CHECK-SD-NEXT: fmov x12, d19
+; CHECK-SD-NEXT: mov v0.s[2], v18.s[0]
+; CHECK-SD-NEXT: and v18.16b, v4.16b, v16.16b
+; CHECK-SD-NEXT: scvtf s19, x14
+; CHECK-SD-NEXT: fmov x15, d23
+; CHECK-SD-NEXT: mov v1.s[1], v22.s[0]
+; CHECK-SD-NEXT: fmov x14, d6
+; CHECK-SD-NEXT: scvtf s4, x12
+; CHECK-SD-NEXT: mov x12, v6.d[1]
+; CHECK-SD-NEXT: scvtf s6, x13
+; CHECK-SD-NEXT: ushr v22.2d, v5.2d, #32
+; CHECK-SD-NEXT: fmov x13, d18
+; CHECK-SD-NEXT: mov v2.s[2], v20.s[0]
+; CHECK-SD-NEXT: mov v3.s[2], v21.s[0]
+; CHECK-SD-NEXT: scvtf s20, x11
+; CHECK-SD-NEXT: mov x11, v18.d[1]
+; CHECK-SD-NEXT: scvtf s21, x15
+; CHECK-SD-NEXT: and v5.16b, v5.16b, v16.16b
+; CHECK-SD-NEXT: mov x8, v24.d[1]
+; CHECK-SD-NEXT: mov v4.s[1], v19.s[0]
+; CHECK-SD-NEXT: and v19.16b, v7.16b, v16.16b
+; CHECK-SD-NEXT: scvtf s16, x13
+; CHECK-SD-NEXT: fmov x13, d22
+; CHECK-SD-NEXT: ushr v7.2d, v7.2d, #32
+; CHECK-SD-NEXT: mov v1.s[2], v17.s[0]
+; CHECK-SD-NEXT: scvtf s18, x11
+; CHECK-SD-NEXT: mov x11, v22.d[1]
+; CHECK-SD-NEXT: fmov x15, d5
+; CHECK-SD-NEXT: mov v21.s[1], v6.s[0]
+; CHECK-SD-NEXT: scvtf s22, x12
+; CHECK-SD-NEXT: mov v3.s[3], v20.s[0]
+; CHECK-SD-NEXT: scvtf s6, x13
+; CHECK-SD-NEXT: fmov x13, d7
+; CHECK-SD-NEXT: mov x12, v7.d[1]
+; CHECK-SD-NEXT: scvtf s7, x14
+; CHECK-SD-NEXT: mov x14, v5.d[1]
+; CHECK-SD-NEXT: scvtf s20, x15
+; CHECK-SD-NEXT: mov v16.s[1], v18.s[0]
+; CHECK-SD-NEXT: scvtf s18, x9
+; CHECK-SD-NEXT: mov x9, v19.d[1]
+; CHECK-SD-NEXT: scvtf s5, x13
+; CHECK-SD-NEXT: scvtf s17, x8
+; CHECK-SD-NEXT: mov v21.s[2], v6.s[0]
+; CHECK-SD-NEXT: scvtf s6, x11
+; CHECK-SD-NEXT: fmov x11, d19
+; CHECK-SD-NEXT: scvtf s19, x10
+; CHECK-SD-NEXT: mov w10, #1333788672 // =0x4f800000
+; CHECK-SD-NEXT: mov v7.s[1], v22.s[0]
+; CHECK-SD-NEXT: mov v16.s[2], v20.s[0]
+; CHECK-SD-NEXT: dup v20.4s, w10
+; CHECK-SD-NEXT: mov v0.s[3], v18.s[0]
+; CHECK-SD-NEXT: scvtf s22, x11
+; CHECK-SD-NEXT: mov v4.s[2], v5.s[0]
+; CHECK-SD-NEXT: scvtf s5, x12
+; CHECK-SD-NEXT: mov v21.s[3], v6.s[0]
+; CHECK-SD-NEXT: scvtf s6, x14
+; CHECK-SD-NEXT: mov v1.s[3], v17.s[0]
+; CHECK-SD-NEXT: mov v2.s[3], v19.s[0]
+; CHECK-SD-NEXT: fmul v3.4s, v3.4s, v20.4s
+; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v20.4s
+; CHECK-SD-NEXT: mov v7.s[2], v22.s[0]
+; CHECK-SD-NEXT: mov v4.s[3], v5.s[0]
+; CHECK-SD-NEXT: scvtf s5, x9
+; CHECK-SD-NEXT: mov v16.s[3], v6.s[0]
+; CHECK-SD-NEXT: fmul v6.4s, v21.4s, v20.4s
+; CHECK-SD-NEXT: fadd v2.4s, v3.4s, v2.4s
+; CHECK-SD-NEXT: mov v7.s[3], v5.s[0]
+; CHECK-SD-NEXT: fmul v3.4s, v4.4s, v20.4s
+; CHECK-SD-NEXT: fadd v5.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: fadd v4.4s, v6.4s, v16.4s
+; CHECK-SD-NEXT: fcvtn v0.4h, v2.4s
+; CHECK-SD-NEXT: fadd v2.4s, v3.4s, v7.4s
; CHECK-SD-NEXT: fcvtn v1.4h, v4.4s
-; CHECK-SD-NEXT: fcvtn2 v0.8h, v2.4s
-; CHECK-SD-NEXT: fcvtn2 v1.8h, v6.4s
+; CHECK-SD-NEXT: fcvtn2 v0.8h, v5.4s
+; CHECK-SD-NEXT: fcvtn2 v1.8h, v2.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-NOFP16-LABEL: utofp_v16i64_v16f16:
@@ -6767,50 +7629,106 @@ entry:
define <32 x half> @stofp_v32i64_v32f16(<32 x i64> %a) {
; CHECK-SD-LABEL: stofp_v32i64_v32f16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldp q17, q16, [sp]
-; CHECK-SD-NEXT: scvtf v18.2d, v0.2d
-; CHECK-SD-NEXT: ldp q19, q0, [sp, #64]
-; CHECK-SD-NEXT: scvtf v4.2d, v4.2d
-; CHECK-SD-NEXT: ldp q21, q20, [sp, #96]
-; CHECK-SD-NEXT: scvtf v2.2d, v2.2d
-; CHECK-SD-NEXT: scvtf v17.2d, v17.2d
-; CHECK-SD-NEXT: ldp q23, q22, [sp, #32]
-; CHECK-SD-NEXT: scvtf v19.2d, v19.2d
-; CHECK-SD-NEXT: fcvtn v18.2s, v18.2d
-; CHECK-SD-NEXT: scvtf v1.2d, v1.2d
-; CHECK-SD-NEXT: scvtf v6.2d, v6.2d
-; CHECK-SD-NEXT: fcvtn v4.2s, v4.2d
-; CHECK-SD-NEXT: scvtf v5.2d, v5.2d
-; CHECK-SD-NEXT: scvtf v23.2d, v23.2d
-; CHECK-SD-NEXT: scvtf v16.2d, v16.2d
-; CHECK-SD-NEXT: scvtf v21.2d, v21.2d
-; CHECK-SD-NEXT: fcvtn v17.2s, v17.2d
-; CHECK-SD-NEXT: scvtf v0.2d, v0.2d
-; CHECK-SD-NEXT: fcvtn v24.2s, v2.2d
-; CHECK-SD-NEXT: fcvtn v19.2s, v19.2d
-; CHECK-SD-NEXT: scvtf v2.2d, v3.2d
-; CHECK-SD-NEXT: fcvtn2 v18.4s, v1.2d
-; CHECK-SD-NEXT: fcvtn v6.2s, v6.2d
-; CHECK-SD-NEXT: scvtf v1.2d, v7.2d
-; CHECK-SD-NEXT: fcvtn2 v4.4s, v5.2d
-; CHECK-SD-NEXT: fcvtn v5.2s, v23.2d
-; CHECK-SD-NEXT: scvtf v3.2d, v22.2d
-; CHECK-SD-NEXT: fcvtn v7.2s, v21.2d
-; CHECK-SD-NEXT: fcvtn2 v17.4s, v16.2d
-; CHECK-SD-NEXT: scvtf v16.2d, v20.2d
-; CHECK-SD-NEXT: fcvtn2 v19.4s, v0.2d
-; CHECK-SD-NEXT: fcvtn2 v24.4s, v2.2d
-; CHECK-SD-NEXT: fcvtn v0.4h, v18.4s
-; CHECK-SD-NEXT: fcvtn2 v6.4s, v1.2d
-; CHECK-SD-NEXT: fcvtn v1.4h, v4.4s
-; CHECK-SD-NEXT: fcvtn2 v5.4s, v3.2d
-; CHECK-SD-NEXT: fcvtn v2.4h, v17.4s
-; CHECK-SD-NEXT: fcvtn2 v7.4s, v16.2d
-; CHECK-SD-NEXT: fcvtn v3.4h, v19.4s
-; CHECK-SD-NEXT: fcvtn2 v0.8h, v24.4s
+; CHECK-SD-NEXT: fmov x10, d0
+; CHECK-SD-NEXT: mov x8, v0.d[1]
+; CHECK-SD-NEXT: mov x11, v6.d[1]
+; CHECK-SD-NEXT: fmov x13, d1
+; CHECK-SD-NEXT: ldp q23, q18, [sp, #32]
+; CHECK-SD-NEXT: mov x9, v2.d[1]
+; CHECK-SD-NEXT: fmov x12, d3
+; CHECK-SD-NEXT: fmov x14, d7
+; CHECK-SD-NEXT: scvtf s0, x10
+; CHECK-SD-NEXT: ldp q21, q19, [sp]
+; CHECK-SD-NEXT: fmov x10, d2
+; CHECK-SD-NEXT: scvtf s22, x8
+; CHECK-SD-NEXT: scvtf s24, x11
+; CHECK-SD-NEXT: fmov x11, d4
+; CHECK-SD-NEXT: ldp q20, q17, [sp, #64]
+; CHECK-SD-NEXT: mov x8, v3.d[1]
+; CHECK-SD-NEXT: scvtf s3, x13
+; CHECK-SD-NEXT: fmov x13, d6
+; CHECK-SD-NEXT: scvtf s16, x10
+; CHECK-SD-NEXT: mov x10, v4.d[1]
+; CHECK-SD-NEXT: scvtf s2, x9
+; CHECK-SD-NEXT: mov v0.s[1], v22.s[0]
+; CHECK-SD-NEXT: mov x9, v1.d[1]
+; CHECK-SD-NEXT: scvtf s1, x11
+; CHECK-SD-NEXT: mov x11, v23.d[1]
+; CHECK-SD-NEXT: scvtf s6, x13
+; CHECK-SD-NEXT: fmov x13, d5
+; CHECK-SD-NEXT: scvtf s4, x10
+; CHECK-SD-NEXT: mov x10, v5.d[1]
+; CHECK-SD-NEXT: mov v16.s[1], v2.s[0]
+; CHECK-SD-NEXT: ldp q22, q2, [sp, #96]
+; CHECK-SD-NEXT: mov v0.s[2], v3.s[0]
+; CHECK-SD-NEXT: scvtf s3, x12
+; CHECK-SD-NEXT: mov x12, v21.d[1]
+; CHECK-SD-NEXT: scvtf s5, x13
+; CHECK-SD-NEXT: fmov x13, d23
+; CHECK-SD-NEXT: scvtf s23, x11
+; CHECK-SD-NEXT: mov x11, v20.d[1]
+; CHECK-SD-NEXT: mov v1.s[1], v4.s[0]
+; CHECK-SD-NEXT: mov v6.s[1], v24.s[0]
+; CHECK-SD-NEXT: scvtf s4, x13
+; CHECK-SD-NEXT: scvtf s24, x12
+; CHECK-SD-NEXT: fmov x12, d21
+; CHECK-SD-NEXT: scvtf s21, x9
+; CHECK-SD-NEXT: fmov x9, d20
+; CHECK-SD-NEXT: mov x13, v7.d[1]
+; CHECK-SD-NEXT: mov v1.s[2], v5.s[0]
+; CHECK-SD-NEXT: scvtf s5, x11
+; CHECK-SD-NEXT: fmov x11, d19
+; CHECK-SD-NEXT: scvtf s7, x12
+; CHECK-SD-NEXT: mov x12, v22.d[1]
+; CHECK-SD-NEXT: mov v16.s[2], v3.s[0]
+; CHECK-SD-NEXT: scvtf s20, x9
+; CHECK-SD-NEXT: mov v4.s[1], v23.s[0]
+; CHECK-SD-NEXT: scvtf s23, x14
+; CHECK-SD-NEXT: mov x9, v19.d[1]
+; CHECK-SD-NEXT: scvtf s19, x11
+; CHECK-SD-NEXT: fmov x11, d22
+; CHECK-SD-NEXT: fmov x14, d17
+; CHECK-SD-NEXT: scvtf s22, x10
+; CHECK-SD-NEXT: mov x10, v18.d[1]
+; CHECK-SD-NEXT: mov v7.s[1], v24.s[0]
+; CHECK-SD-NEXT: scvtf s24, x12
+; CHECK-SD-NEXT: mov x12, v17.d[1]
+; CHECK-SD-NEXT: scvtf s17, x11
+; CHECK-SD-NEXT: mov v20.s[1], v5.s[0]
+; CHECK-SD-NEXT: fmov x11, d18
+; CHECK-SD-NEXT: scvtf s5, x14
+; CHECK-SD-NEXT: fmov x14, d2
+; CHECK-SD-NEXT: scvtf s3, x13
+; CHECK-SD-NEXT: mov v0.s[3], v21.s[0]
+; CHECK-SD-NEXT: scvtf s21, x10
+; CHECK-SD-NEXT: mov v6.s[2], v23.s[0]
+; CHECK-SD-NEXT: scvtf s18, x11
+; CHECK-SD-NEXT: mov x11, v2.d[1]
+; CHECK-SD-NEXT: scvtf s2, x9
+; CHECK-SD-NEXT: mov v7.s[2], v19.s[0]
+; CHECK-SD-NEXT: scvtf s19, x14
+; CHECK-SD-NEXT: mov v17.s[1], v24.s[0]
+; CHECK-SD-NEXT: scvtf s24, x12
+; CHECK-SD-NEXT: mov v20.s[2], v5.s[0]
+; CHECK-SD-NEXT: scvtf s5, x8
+; CHECK-SD-NEXT: mov v1.s[3], v22.s[0]
+; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-SD-NEXT: mov v6.s[3], v3.s[0]
+; CHECK-SD-NEXT: mov v4.s[2], v18.s[0]
+; CHECK-SD-NEXT: scvtf s18, x11
+; CHECK-SD-NEXT: mov v7.s[3], v2.s[0]
+; CHECK-SD-NEXT: mov v17.s[2], v19.s[0]
+; CHECK-SD-NEXT: mov v20.s[3], v24.s[0]
+; CHECK-SD-NEXT: mov v16.s[3], v5.s[0]
+; CHECK-SD-NEXT: fcvtn v1.4h, v1.4s
+; CHECK-SD-NEXT: mov v4.s[3], v21.s[0]
+; CHECK-SD-NEXT: fcvtn v2.4h, v7.4s
+; CHECK-SD-NEXT: mov v17.s[3], v18.s[0]
+; CHECK-SD-NEXT: fcvtn v3.4h, v20.4s
+; CHECK-SD-NEXT: fcvtn2 v0.8h, v16.4s
; CHECK-SD-NEXT: fcvtn2 v1.8h, v6.4s
-; CHECK-SD-NEXT: fcvtn2 v2.8h, v5.4s
-; CHECK-SD-NEXT: fcvtn2 v3.8h, v7.4s
+; CHECK-SD-NEXT: fcvtn2 v2.8h, v4.4s
+; CHECK-SD-NEXT: fcvtn2 v3.8h, v17.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-NOFP16-LABEL: stofp_v32i64_v32f16:
@@ -6970,50 +7888,253 @@ entry:
define <32 x half> @utofp_v32i64_v32f16(<32 x i64> %a) {
; CHECK-SD-LABEL: utofp_v32i64_v32f16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldp q17, q16, [sp]
-; CHECK-SD-NEXT: ucvtf v18.2d, v0.2d
-; CHECK-SD-NEXT: ldp q19, q0, [sp, #64]
-; CHECK-SD-NEXT: ucvtf v4.2d, v4.2d
-; CHECK-SD-NEXT: ldp q21, q20, [sp, #96]
-; CHECK-SD-NEXT: ucvtf v2.2d, v2.2d
-; CHECK-SD-NEXT: ucvtf v17.2d, v17.2d
-; CHECK-SD-NEXT: ldp q23, q22, [sp, #32]
-; CHECK-SD-NEXT: ucvtf v19.2d, v19.2d
-; CHECK-SD-NEXT: fcvtn v18.2s, v18.2d
-; CHECK-SD-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-SD-NEXT: ucvtf v6.2d, v6.2d
-; CHECK-SD-NEXT: fcvtn v4.2s, v4.2d
-; CHECK-SD-NEXT: ucvtf v5.2d, v5.2d
-; CHECK-SD-NEXT: ucvtf v23.2d, v23.2d
-; CHECK-SD-NEXT: ucvtf v16.2d, v16.2d
-; CHECK-SD-NEXT: ucvtf v21.2d, v21.2d
-; CHECK-SD-NEXT: fcvtn v17.2s, v17.2d
-; CHECK-SD-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-SD-NEXT: fcvtn v24.2s, v2.2d
-; CHECK-SD-NEXT: fcvtn v19.2s, v19.2d
-; CHECK-SD-NEXT: ucvtf v2.2d, v3.2d
-; CHECK-SD-NEXT: fcvtn2 v18.4s, v1.2d
-; CHECK-SD-NEXT: fcvtn v6.2s, v6.2d
-; CHECK-SD-NEXT: ucvtf v1.2d, v7.2d
-; CHECK-SD-NEXT: fcvtn2 v4.4s, v5.2d
-; CHECK-SD-NEXT: fcvtn v5.2s, v23.2d
-; CHECK-SD-NEXT: ucvtf v3.2d, v22.2d
-; CHECK-SD-NEXT: fcvtn v7.2s, v21.2d
-; CHECK-SD-NEXT: fcvtn2 v17.4s, v16.2d
-; CHECK-SD-NEXT: ucvtf v16.2d, v20.2d
-; CHECK-SD-NEXT: fcvtn2 v19.4s, v0.2d
-; CHECK-SD-NEXT: fcvtn2 v24.4s, v2.2d
-; CHECK-SD-NEXT: fcvtn v0.4h, v18.4s
-; CHECK-SD-NEXT: fcvtn2 v6.4s, v1.2d
+; CHECK-SD-NEXT: str d10, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-SD-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT: .cfi_offset b8, -8
+; CHECK-SD-NEXT: .cfi_offset b9, -16
+; CHECK-SD-NEXT: .cfi_offset b10, -32
+; CHECK-SD-NEXT: movi v16.2d, #0x000000ffffffff
+; CHECK-SD-NEXT: ushr v18.2d, v3.2d, #32
+; CHECK-SD-NEXT: ushr v17.2d, v2.2d, #32
+; CHECK-SD-NEXT: ushr v20.2d, v0.2d, #32
+; CHECK-SD-NEXT: ushr v19.2d, v1.2d, #32
+; CHECK-SD-NEXT: ushr v23.2d, v6.2d, #32
+; CHECK-SD-NEXT: ushr v31.2d, v7.2d, #32
+; CHECK-SD-NEXT: fmov x14, d18
+; CHECK-SD-NEXT: mov x8, v17.d[1]
+; CHECK-SD-NEXT: fmov x13, d17
+; CHECK-SD-NEXT: and v21.16b, v0.16b, v16.16b
+; CHECK-SD-NEXT: and v1.16b, v1.16b, v16.16b
+; CHECK-SD-NEXT: and v22.16b, v2.16b, v16.16b
+; CHECK-SD-NEXT: mov x10, v18.d[1]
+; CHECK-SD-NEXT: mov x11, v20.d[1]
+; CHECK-SD-NEXT: and v24.16b, v3.16b, v16.16b
+; CHECK-SD-NEXT: scvtf s17, x14
+; CHECK-SD-NEXT: scvtf s0, x13
+; CHECK-SD-NEXT: and v6.16b, v6.16b, v16.16b
+; CHECK-SD-NEXT: mov x12, v21.d[1]
+; CHECK-SD-NEXT: fmov x14, d21
+; CHECK-SD-NEXT: scvtf s18, x8
+; CHECK-SD-NEXT: fmov x8, d20
+; CHECK-SD-NEXT: fmov x15, d1
+; CHECK-SD-NEXT: mov x13, v22.d[1]
+; CHECK-SD-NEXT: scvtf s27, x10
+; CHECK-SD-NEXT: mov x10, v23.d[1]
+; CHECK-SD-NEXT: and v29.16b, v4.16b, v16.16b
+; CHECK-SD-NEXT: scvtf s2, x14
+; CHECK-SD-NEXT: fmov x14, d22
+; CHECK-SD-NEXT: scvtf s22, x11
+; CHECK-SD-NEXT: scvtf s20, x12
+; CHECK-SD-NEXT: mov x12, v1.d[1]
+; CHECK-SD-NEXT: scvtf s21, x15
+; CHECK-SD-NEXT: mov v0.s[1], v18.s[0]
+; CHECK-SD-NEXT: fmov x11, d19
+; CHECK-SD-NEXT: scvtf s25, x13
+; CHECK-SD-NEXT: fmov x13, d24
+; CHECK-SD-NEXT: scvtf s3, x8
+; CHECK-SD-NEXT: mov x8, v24.d[1]
+; CHECK-SD-NEXT: scvtf s1, x14
+; CHECK-SD-NEXT: mov x9, v19.d[1]
+; CHECK-SD-NEXT: ldp q19, q18, [sp, #96]
+; CHECK-SD-NEXT: mov v2.s[1], v20.s[0]
+; CHECK-SD-NEXT: scvtf s28, x12
+; CHECK-SD-NEXT: mov v0.s[2], v17.s[0]
+; CHECK-SD-NEXT: scvtf s24, x11
+; CHECK-SD-NEXT: mov x11, v6.d[1]
+; CHECK-SD-NEXT: fmov x12, d6
+; CHECK-SD-NEXT: scvtf s26, x13
+; CHECK-SD-NEXT: fmov x13, d29
+; CHECK-SD-NEXT: mov v1.s[1], v25.s[0]
+; CHECK-SD-NEXT: mov v3.s[1], v22.s[0]
+; CHECK-SD-NEXT: mov v2.s[2], v21.s[0]
+; CHECK-SD-NEXT: ldp q21, q20, [sp, #128]
+; CHECK-SD-NEXT: mov v0.s[3], v27.s[0]
+; CHECK-SD-NEXT: scvtf s27, x10
+; CHECK-SD-NEXT: fmov x10, d23
+; CHECK-SD-NEXT: ldp q25, q23, [sp, #64]
+; CHECK-SD-NEXT: scvtf s6, x12
+; CHECK-SD-NEXT: scvtf s30, x11
+; CHECK-SD-NEXT: mov x11, v29.d[1]
+; CHECK-SD-NEXT: and v29.16b, v5.16b, v16.16b
+; CHECK-SD-NEXT: mov v2.s[3], v28.s[0]
+; CHECK-SD-NEXT: ushr v28.2d, v4.2d, #32
+; CHECK-SD-NEXT: scvtf s17, x10
+; CHECK-SD-NEXT: scvtf s4, x13
+; CHECK-SD-NEXT: mov v1.s[2], v26.s[0]
+; CHECK-SD-NEXT: and v26.16b, v7.16b, v16.16b
+; CHECK-SD-NEXT: mov v3.s[2], v24.s[0]
+; CHECK-SD-NEXT: ldp q22, q24, [sp, #32]
+; CHECK-SD-NEXT: mov x12, v28.d[1]
+; CHECK-SD-NEXT: fmov x13, d28
+; CHECK-SD-NEXT: ushr v28.2d, v5.2d, #32
+; CHECK-SD-NEXT: mov v6.s[1], v30.s[0]
+; CHECK-SD-NEXT: mov v17.s[1], v27.s[0]
+; CHECK-SD-NEXT: scvtf s27, x11
+; CHECK-SD-NEXT: fmov x11, d31
+; CHECK-SD-NEXT: fmov x10, d26
+; CHECK-SD-NEXT: scvtf s7, x9
+; CHECK-SD-NEXT: scvtf s5, x13
+; CHECK-SD-NEXT: fmov x13, d28
+; CHECK-SD-NEXT: mov x9, v26.d[1]
+; CHECK-SD-NEXT: scvtf s30, x12
+; CHECK-SD-NEXT: fmov x12, d29
+; CHECK-SD-NEXT: and v9.16b, v23.16b, v16.16b
+; CHECK-SD-NEXT: mov v4.s[1], v27.s[0]
+; CHECK-SD-NEXT: scvtf s8, x11
+; CHECK-SD-NEXT: mov x11, v29.d[1]
+; CHECK-SD-NEXT: and v29.16b, v22.16b, v16.16b
+; CHECK-SD-NEXT: scvtf s26, x10
+; CHECK-SD-NEXT: mov x10, v31.d[1]
+; CHECK-SD-NEXT: scvtf s27, x12
+; CHECK-SD-NEXT: mov x12, v28.d[1]
+; CHECK-SD-NEXT: scvtf s28, x13
+; CHECK-SD-NEXT: mov v5.s[1], v30.s[0]
+; CHECK-SD-NEXT: ushr v31.2d, v25.2d, #32
+; CHECK-SD-NEXT: and v25.16b, v25.16b, v16.16b
+; CHECK-SD-NEXT: mov v3.s[3], v7.s[0]
+; CHECK-SD-NEXT: mov v17.s[2], v8.s[0]
+; CHECK-SD-NEXT: ushr v10.2d, v24.2d, #32
+; CHECK-SD-NEXT: mov v6.s[2], v26.s[0]
+; CHECK-SD-NEXT: scvtf s26, x10
+; CHECK-SD-NEXT: mov x14, v31.d[1]
+; CHECK-SD-NEXT: mov x10, v25.d[1]
+; CHECK-SD-NEXT: fmov x13, d31
+; CHECK-SD-NEXT: mov v5.s[2], v28.s[0]
+; CHECK-SD-NEXT: scvtf s28, x12
+; CHECK-SD-NEXT: fmov x12, d29
+; CHECK-SD-NEXT: mov v4.s[2], v27.s[0]
+; CHECK-SD-NEXT: scvtf s27, x11
+; CHECK-SD-NEXT: fmov x11, d25
+; CHECK-SD-NEXT: ushr v25.2d, v22.2d, #32
+; CHECK-SD-NEXT: ushr v31.2d, v23.2d, #32
+; CHECK-SD-NEXT: scvtf s7, x13
+; CHECK-SD-NEXT: scvtf s23, x12
+; CHECK-SD-NEXT: fmov x12, d9
+; CHECK-SD-NEXT: scvtf s30, x14
+; CHECK-SD-NEXT: mov x13, v29.d[1]
+; CHECK-SD-NEXT: scvtf s8, x10
+; CHECK-SD-NEXT: scvtf s22, x11
+; CHECK-SD-NEXT: mov x14, v25.d[1]
+; CHECK-SD-NEXT: fmov x10, d31
+; CHECK-SD-NEXT: mov x11, v31.d[1]
+; CHECK-SD-NEXT: scvtf s29, x12
+; CHECK-SD-NEXT: fmov x12, d25
+; CHECK-SD-NEXT: mov v5.s[3], v28.s[0]
+; CHECK-SD-NEXT: mov v7.s[1], v30.s[0]
+; CHECK-SD-NEXT: mov v17.s[3], v26.s[0]
+; CHECK-SD-NEXT: mov v4.s[3], v27.s[0]
+; CHECK-SD-NEXT: scvtf s30, x10
+; CHECK-SD-NEXT: mov x10, v9.d[1]
+; CHECK-SD-NEXT: mov v22.s[1], v8.s[0]
+; CHECK-SD-NEXT: scvtf s9, x14
+; CHECK-SD-NEXT: scvtf s25, x12
+; CHECK-SD-NEXT: and v8.16b, v24.16b, v16.16b
+; CHECK-SD-NEXT: scvtf s31, x13
+; CHECK-SD-NEXT: fmov x13, d10
+; CHECK-SD-NEXT: scvtf s24, x11
+; CHECK-SD-NEXT: and v27.16b, v20.16b, v16.16b
+; CHECK-SD-NEXT: ushr v20.2d, v20.2d, #32
+; CHECK-SD-NEXT: fmov x12, d8
+; CHECK-SD-NEXT: mov v7.s[2], v30.s[0]
+; CHECK-SD-NEXT: and v30.16b, v21.16b, v16.16b
+; CHECK-SD-NEXT: mov v25.s[1], v9.s[0]
+; CHECK-SD-NEXT: ushr v9.2d, v19.2d, #32
+; CHECK-SD-NEXT: ushr v21.2d, v21.2d, #32
+; CHECK-SD-NEXT: mov v23.s[1], v31.s[0]
+; CHECK-SD-NEXT: mov x11, v8.d[1]
+; CHECK-SD-NEXT: scvtf s8, x13
+; CHECK-SD-NEXT: scvtf s31, x12
+; CHECK-SD-NEXT: mov x12, v10.d[1]
+; CHECK-SD-NEXT: mov x13, v30.d[1]
+; CHECK-SD-NEXT: mov x14, v9.d[1]
+; CHECK-SD-NEXT: and v19.16b, v19.16b, v16.16b
+; CHECK-SD-NEXT: mov x15, v21.d[1]
+; CHECK-SD-NEXT: fmov x16, d9
+; CHECK-SD-NEXT: mov v7.s[3], v24.s[0]
+; CHECK-SD-NEXT: mov v22.s[2], v29.s[0]
+; CHECK-SD-NEXT: and v16.16b, v18.16b, v16.16b
+; CHECK-SD-NEXT: mov v25.s[2], v8.s[0]
+; CHECK-SD-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT: scvtf s24, x12
+; CHECK-SD-NEXT: fmov x12, d30
+; CHECK-SD-NEXT: scvtf s28, x14
+; CHECK-SD-NEXT: ushr v30.2d, v18.2d, #32
+; CHECK-SD-NEXT: fmov x14, d21
+; CHECK-SD-NEXT: scvtf s26, x13
+; CHECK-SD-NEXT: mov x13, v19.d[1]
+; CHECK-SD-NEXT: scvtf s29, x16
+; CHECK-SD-NEXT: scvtf s21, x15
+; CHECK-SD-NEXT: fmov x15, d19
+; CHECK-SD-NEXT: fmov x16, d16
+; CHECK-SD-NEXT: mov v23.s[2], v31.s[0]
+; CHECK-SD-NEXT: mov v25.s[3], v24.s[0]
+; CHECK-SD-NEXT: scvtf s19, x14
+; CHECK-SD-NEXT: fmov x14, d30
+; CHECK-SD-NEXT: scvtf s24, x9
+; CHECK-SD-NEXT: scvtf s18, x13
+; CHECK-SD-NEXT: mov x13, v30.d[1]
+; CHECK-SD-NEXT: scvtf s30, x15
+; CHECK-SD-NEXT: mov v29.s[1], v28.s[0]
+; CHECK-SD-NEXT: fmov x15, d20
+; CHECK-SD-NEXT: scvtf s28, x14
+; CHECK-SD-NEXT: mov x14, v20.d[1]
+; CHECK-SD-NEXT: scvtf s20, x12
+; CHECK-SD-NEXT: mov x12, v16.d[1]
+; CHECK-SD-NEXT: mov v19.s[1], v21.s[0]
+; CHECK-SD-NEXT: scvtf s21, x16
+; CHECK-SD-NEXT: scvtf s16, x15
+; CHECK-SD-NEXT: mov w15, #1333788672 // =0x4f800000
+; CHECK-SD-NEXT: mov v30.s[1], v18.s[0]
+; CHECK-SD-NEXT: scvtf s18, x13
+; CHECK-SD-NEXT: fmov x13, d27
+; CHECK-SD-NEXT: mov v6.s[3], v24.s[0]
+; CHECK-SD-NEXT: mov v29.s[2], v28.s[0]
+; CHECK-SD-NEXT: scvtf s28, x11
+; CHECK-SD-NEXT: mov x11, v27.d[1]
+; CHECK-SD-NEXT: mov v20.s[1], v26.s[0]
+; CHECK-SD-NEXT: dup v27.4s, w15
+; CHECK-SD-NEXT: scvtf s26, x13
+; CHECK-SD-NEXT: mov v19.s[2], v16.s[0]
+; CHECK-SD-NEXT: scvtf s16, x14
+; CHECK-SD-NEXT: mov v30.s[2], v21.s[0]
+; CHECK-SD-NEXT: scvtf s21, x8
+; CHECK-SD-NEXT: mov v29.s[3], v18.s[0]
+; CHECK-SD-NEXT: scvtf s18, x12
+; CHECK-SD-NEXT: mov v23.s[3], v28.s[0]
+; CHECK-SD-NEXT: fmul v3.4s, v3.4s, v27.4s
+; CHECK-SD-NEXT: fmul v5.4s, v5.4s, v27.4s
+; CHECK-SD-NEXT: fmul v25.4s, v25.4s, v27.4s
+; CHECK-SD-NEXT: mov v20.s[2], v26.s[0]
+; CHECK-SD-NEXT: mov v19.s[3], v16.s[0]
+; CHECK-SD-NEXT: scvtf s16, x10
+; CHECK-SD-NEXT: mov v1.s[3], v21.s[0]
+; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v27.4s
+; CHECK-SD-NEXT: mov v30.s[3], v18.s[0]
+; CHECK-SD-NEXT: scvtf s18, x11
+; CHECK-SD-NEXT: fmul v26.4s, v29.4s, v27.4s
+; CHECK-SD-NEXT: fadd v2.4s, v3.4s, v2.4s
+; CHECK-SD-NEXT: fmul v3.4s, v17.4s, v27.4s
+; CHECK-SD-NEXT: fadd v4.4s, v5.4s, v4.4s
+; CHECK-SD-NEXT: mov v22.s[3], v16.s[0]
+; CHECK-SD-NEXT: fmul v5.4s, v7.4s, v27.4s
+; CHECK-SD-NEXT: fadd v7.4s, v25.4s, v23.4s
+; CHECK-SD-NEXT: fmul v16.4s, v19.4s, v27.4s
+; CHECK-SD-NEXT: mov v20.s[3], v18.s[0]
+; CHECK-SD-NEXT: fadd v17.4s, v26.4s, v30.4s
+; CHECK-SD-NEXT: fadd v18.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: fcvtn v0.4h, v2.4s
+; CHECK-SD-NEXT: fadd v6.4s, v3.4s, v6.4s
; CHECK-SD-NEXT: fcvtn v1.4h, v4.4s
-; CHECK-SD-NEXT: fcvtn2 v5.4s, v3.2d
-; CHECK-SD-NEXT: fcvtn v2.4h, v17.4s
-; CHECK-SD-NEXT: fcvtn2 v7.4s, v16.2d
-; CHECK-SD-NEXT: fcvtn v3.4h, v19.4s
-; CHECK-SD-NEXT: fcvtn2 v0.8h, v24.4s
+; CHECK-SD-NEXT: fadd v4.4s, v5.4s, v22.4s
+; CHECK-SD-NEXT: fcvtn v2.4h, v7.4s
+; CHECK-SD-NEXT: fadd v5.4s, v16.4s, v20.4s
+; CHECK-SD-NEXT: fcvtn v3.4h, v17.4s
+; CHECK-SD-NEXT: fcvtn2 v0.8h, v18.4s
; CHECK-SD-NEXT: fcvtn2 v1.8h, v6.4s
-; CHECK-SD-NEXT: fcvtn2 v2.8h, v5.4s
-; CHECK-SD-NEXT: fcvtn2 v3.8h, v7.4s
+; CHECK-SD-NEXT: fcvtn2 v2.8h, v4.4s
+; CHECK-SD-NEXT: fcvtn2 v3.8h, v5.4s
+; CHECK-SD-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload
; CHECK-SD-NEXT: ret
;
; CHECK-GI-NOFP16-LABEL: utofp_v32i64_v32f16:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
index 573fe3d8b8a77..ecc2e423c6e42 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
@@ -722,8 +722,23 @@ define <1 x float> @ucvtf_v1i64_v1f32(<1 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v1i64_v1f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
+; CHECK-NEXT: ushr v2.2d, v0.2d, #32
+; CHECK-NEXT: mov x8, v2.d[1]
+; CHECK-NEXT: fmov x9, d2
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: scvtf s2, x9
+; CHECK-NEXT: mov w9, #1333788672 // =0x4f800000
+; CHECK-NEXT: scvtf s1, x8
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: dup v3.2s, w9
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: scvtf s0, x8
+; CHECK-NEXT: mov v2.s[1], v1.s[0]
+; CHECK-NEXT: scvtf s1, x9
+; CHECK-NEXT: fmul v2.2s, v2.2s, v3.2s
+; CHECK-NEXT: mov v1.s[1], v0.s[0]
+; CHECK-NEXT: fadd v0.2s, v2.2s, v1.2s
; CHECK-NEXT: ret
%res = uitofp <1 x i64> %op1 to <1 x float>
ret <1 x float> %res
@@ -733,8 +748,23 @@ define <1 x float> @ucvtf_v1i64_v1f32(<1 x i64> %op1) vscale_range(2,0) #0 {
define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: ucvtf_v2i64_v2f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
+; CHECK-NEXT: ushr v2.2d, v0.2d, #32
+; CHECK-NEXT: mov x8, v2.d[1]
+; CHECK-NEXT: fmov x9, d2
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: scvtf s2, x9
+; CHECK-NEXT: mov w9, #1333788672 // =0x4f800000
+; CHECK-NEXT: scvtf s1, x8
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: dup v3.2s, w9
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: scvtf s0, x8
+; CHECK-NEXT: mov v2.s[1], v1.s[0]
+; CHECK-NEXT: scvtf s1, x9
+; CHECK-NEXT: fmul v2.2s, v2.2s, v3.2s
+; CHECK-NEXT: mov v1.s[1], v0.s[0]
+; CHECK-NEXT: fadd v0.2s, v2.2s, v1.2s
; CHECK-NEXT: ret
%res = uitofp <2 x i64> %op1 to <2 x float>
ret <2 x float> %res
@@ -1646,8 +1676,11 @@ define <1 x float> @scvtf_v1i64_v1f32(<1 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v1i64_v1f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: scvtf v0.2d, v0.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: movi d1, #0000000000000000
+; CHECK-NEXT: scvtf s0, x8
+; CHECK-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-NEXT: fmov d0, d1
; CHECK-NEXT: ret
%res = sitofp <1 x i64> %op1 to <1 x float>
ret <1 x float> %res
@@ -1657,8 +1690,12 @@ define <1 x float> @scvtf_v1i64_v1f32(<1 x i64> %op1) vscale_range(2,0) #0 {
define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: scvtf_v2i64_v2f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: scvtf v0.2d, v0.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: scvtf s0, x9
+; CHECK-NEXT: scvtf s1, x8
+; CHECK-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%res = sitofp <2 x i64> %op1 to <2 x float>
ret <2 x float> %res
diff --git a/llvm/test/CodeGen/AArch64/vector-fcvt.ll b/llvm/test/CodeGen/AArch64/vector-fcvt.ll
index 8f38bdbedc629..610e9e90ed160 100644
--- a/llvm/test/CodeGen/AArch64/vector-fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/vector-fcvt.ll
@@ -87,14 +87,29 @@ define <8 x float> @sitofp_i32_float(<8 x i32> %a) {
define <8 x float> @sitofp_i64_float(<8 x i64> %a) {
; CHECK-LABEL: sitofp_i64_float:
; CHECK: // %bb.0:
-; CHECK-NEXT: scvtf v0.2d, v0.2d
-; CHECK-NEXT: scvtf v2.2d, v2.2d
-; CHECK-NEXT: scvtf v4.2d, v1.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn v1.2s, v2.2d
-; CHECK-NEXT: scvtf v2.2d, v3.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v4.2d
-; CHECK-NEXT: fcvtn2 v1.4s, v2.2d
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: mov x9, v2.d[1]
+; CHECK-NEXT: fmov x10, d0
+; CHECK-NEXT: fmov x11, d2
+; CHECK-NEXT: scvtf s0, x10
+; CHECK-NEXT: mov x10, v3.d[1]
+; CHECK-NEXT: scvtf s4, x8
+; CHECK-NEXT: scvtf s5, x9
+; CHECK-NEXT: scvtf s2, x11
+; CHECK-NEXT: fmov x9, d1
+; CHECK-NEXT: fmov x11, d3
+; CHECK-NEXT: mov x8, v1.d[1]
+; CHECK-NEXT: scvtf s1, x9
+; CHECK-NEXT: mov v0.s[1], v4.s[0]
+; CHECK-NEXT: scvtf s3, x11
+; CHECK-NEXT: mov v2.s[1], v5.s[0]
+; CHECK-NEXT: scvtf s4, x8
+; CHECK-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-NEXT: scvtf s1, x10
+; CHECK-NEXT: mov v2.s[2], v3.s[0]
+; CHECK-NEXT: mov v0.s[3], v4.s[0]
+; CHECK-NEXT: mov v2.s[3], v1.s[0]
+; CHECK-NEXT: mov v1.16b, v2.16b
; CHECK-NEXT: ret
%1 = sitofp <8 x i64> %a to <8 x float>
ret <8 x float> %1
@@ -177,14 +192,65 @@ define <8 x float> @uitofp_i32_float(<8 x i32> %a) {
define <8 x float> @uitofp_i64_float(<8 x i64> %a) {
; CHECK-LABEL: uitofp_i64_float:
; CHECK: // %bb.0:
-; CHECK-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-NEXT: ucvtf v2.2d, v2.2d
-; CHECK-NEXT: ucvtf v4.2d, v1.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn v1.2s, v2.2d
-; CHECK-NEXT: ucvtf v2.2d, v3.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v4.2d
-; CHECK-NEXT: fcvtn2 v1.4s, v2.2d
+; CHECK-NEXT: movi v4.2d, #0x000000ffffffff
+; CHECK-NEXT: ushr v5.2d, v0.2d, #32
+; CHECK-NEXT: ushr v6.2d, v2.2d, #32
+; CHECK-NEXT: ushr v7.2d, v1.2d, #32
+; CHECK-NEXT: ushr v16.2d, v3.2d, #32
+; CHECK-NEXT: mov x8, v5.d[1]
+; CHECK-NEXT: mov x9, v6.d[1]
+; CHECK-NEXT: fmov x10, d5
+; CHECK-NEXT: and v0.16b, v0.16b, v4.16b
+; CHECK-NEXT: and v2.16b, v2.16b, v4.16b
+; CHECK-NEXT: fmov x13, d6
+; CHECK-NEXT: fmov x12, d7
+; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
+; CHECK-NEXT: mov x11, v7.d[1]
+; CHECK-NEXT: scvtf s5, x10
+; CHECK-NEXT: and v3.16b, v3.16b, v4.16b
+; CHECK-NEXT: mov x10, v0.d[1]
+; CHECK-NEXT: scvtf s6, x8
+; CHECK-NEXT: mov x8, v2.d[1]
+; CHECK-NEXT: scvtf s4, x13
+; CHECK-NEXT: scvtf s7, x9
+; CHECK-NEXT: fmov x9, d16
+; CHECK-NEXT: scvtf s17, x12
+; CHECK-NEXT: fmov x12, d0
+; CHECK-NEXT: fmov x13, d2
+; CHECK-NEXT: scvtf s2, x10
+; CHECK-NEXT: mov v5.s[1], v6.s[0]
+; CHECK-NEXT: scvtf s6, x8
+; CHECK-NEXT: scvtf s0, x12
+; CHECK-NEXT: scvtf s18, x13
+; CHECK-NEXT: mov x8, v16.d[1]
+; CHECK-NEXT: mov v4.s[1], v7.s[0]
+; CHECK-NEXT: scvtf s7, x9
+; CHECK-NEXT: fmov x10, d1
+; CHECK-NEXT: fmov x13, d3
+; CHECK-NEXT: mov x9, v1.d[1]
+; CHECK-NEXT: mov x12, v3.d[1]
+; CHECK-NEXT: mov v5.s[2], v17.s[0]
+; CHECK-NEXT: mov v0.s[1], v2.s[0]
+; CHECK-NEXT: scvtf s1, x10
+; CHECK-NEXT: mov v18.s[1], v6.s[0]
+; CHECK-NEXT: scvtf s2, x11
+; CHECK-NEXT: scvtf s3, x13
+; CHECK-NEXT: mov v4.s[2], v7.s[0]
+; CHECK-NEXT: scvtf s6, x8
+; CHECK-NEXT: mov w8, #1333788672 // =0x4f800000
+; CHECK-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-NEXT: scvtf s1, x9
+; CHECK-NEXT: mov v5.s[3], v2.s[0]
+; CHECK-NEXT: scvtf s2, x12
+; CHECK-NEXT: mov v18.s[2], v3.s[0]
+; CHECK-NEXT: mov v4.s[3], v6.s[0]
+; CHECK-NEXT: dup v3.4s, w8
+; CHECK-NEXT: mov v0.s[3], v1.s[0]
+; CHECK-NEXT: fmul v1.4s, v5.4s, v3.4s
+; CHECK-NEXT: mov v18.s[3], v2.s[0]
+; CHECK-NEXT: fmul v2.4s, v4.4s, v3.4s
+; CHECK-NEXT: fadd v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: fadd v1.4s, v2.4s, v18.4s
; CHECK-NEXT: ret
%1 = uitofp <8 x i64> %a to <8 x float>
ret <8 x float> %1
>From e8ec005a7369ac94d141e993eb4ec4cc8fc17bd7 Mon Sep 17 00:00:00 2001
From: Pranav Kant <prka at google.com>
Date: Tue, 18 Mar 2025 05:05:57 +0000
Subject: [PATCH 5/5] modify
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b9a97d381f1b8..be305c0b18c35 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5095,7 +5095,7 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
uint64_t VTSize = VT.getFixedSizeInBits();
uint64_t InVTSize = InVT.getFixedSizeInBits();
if (VTSize < InVTSize) {
- if (VT.isVector() && VT.getVectorElementType().getFixedSizeInBits() == 32) {
+ if (VT.getVectorElementType() != MVT::f16) {
// Due to the absence of any vector instructions to directly convert
// larger fixed point to lower floating point, we end up using intermediate
// representation before finally getting VTSize-d floating point. This extra
More information about the llvm-commits
mailing list