[llvm] edc1c3d - [AArch64] Make more vector f16 operations legal

David Majnemer via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 8 12:00:03 PST 2024


Author: David Majnemer
Date: 2024-03-08T19:52:54Z
New Revision: edc1c3d24e6f8ed548340ce0369138fb40427a24

URL: https://github.com/llvm/llvm-project/commit/edc1c3d24e6f8ed548340ce0369138fb40427a24
DIFF: https://github.com/llvm/llvm-project/commit/edc1c3d24e6f8ed548340ce0369138fb40427a24.diff

LOG: [AArch64] Make more vector f16 operations legal

v8f16 is a legal type, but promoting its elements to f32 would result in
v8f32, which is an illegal type.

Let's legalize these by a combination of splitting and promoting: each v8f16
operation is split into a pair of v4f16 halves that are promoted to v4f32.

Also, we were being overly cautious with several v4f16 nodes. Mark
more of them as safe to promote to v4f32.

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64InstrInfo.td
    llvm/test/CodeGen/AArch64/faddp-half.ll
    llvm/test/CodeGen/AArch64/faddsub.ll
    llvm/test/CodeGen/AArch64/fcvt.ll
    llvm/test/CodeGen/AArch64/fcvt_combine.ll
    llvm/test/CodeGen/AArch64/fdiv.ll
    llvm/test/CodeGen/AArch64/fmla.ll
    llvm/test/CodeGen/AArch64/fmul.ll
    llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
    llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll
    llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
    llvm/test/CodeGen/AArch64/vecreduce-fmul.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 89b697b2d51528..054311d39e7b83 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -701,43 +701,45 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   }
 
   auto LegalizeNarrowFP = [this](MVT ScalarVT) {
-    for (auto Op : {ISD::SETCC,
-                    ISD::SELECT_CC,
-                    ISD::BR_CC,
-                    ISD::FADD,
-                    ISD::FSUB,
-                    ISD::FMUL,
-                    ISD::FDIV,
-                    ISD::FMA,
-                    ISD::FCEIL,
-                    ISD::FSQRT,
-                    ISD::FFLOOR,
-                    ISD::FNEARBYINT,
-                    ISD::FRINT,
-                    ISD::FROUND,
-                    ISD::FROUNDEVEN,
-                    ISD::FTRUNC,
-                    ISD::FMINNUM,
-                    ISD::FMAXNUM,
-                    ISD::FMINIMUM,
-                    ISD::FMAXIMUM,
-                    ISD::STRICT_FADD,
-                    ISD::STRICT_FSUB,
-                    ISD::STRICT_FMUL,
-                    ISD::STRICT_FDIV,
-                    ISD::STRICT_FMA,
-                    ISD::STRICT_FCEIL,
-                    ISD::STRICT_FFLOOR,
-                    ISD::STRICT_FSQRT,
-                    ISD::STRICT_FRINT,
-                    ISD::STRICT_FNEARBYINT,
-                    ISD::STRICT_FROUND,
-                    ISD::STRICT_FTRUNC,
-                    ISD::STRICT_FROUNDEVEN,
-                    ISD::STRICT_FMINNUM,
-                    ISD::STRICT_FMAXNUM,
-                    ISD::STRICT_FMINIMUM,
-                    ISD::STRICT_FMAXIMUM})
+    for (auto Op : {
+             ISD::SETCC,
+             ISD::SELECT_CC,
+             ISD::BR_CC,
+             ISD::FADD,
+             ISD::FSUB,
+             ISD::FMUL,
+             ISD::FDIV,
+             ISD::FMA,
+             ISD::FCEIL,
+             ISD::FSQRT,
+             ISD::FFLOOR,
+             ISD::FNEARBYINT,
+             ISD::FRINT,
+             ISD::FROUND,
+             ISD::FROUNDEVEN,
+             ISD::FTRUNC,
+             ISD::FMINNUM,
+             ISD::FMAXNUM,
+             ISD::FMINIMUM,
+             ISD::FMAXIMUM,
+             ISD::STRICT_FADD,
+             ISD::STRICT_FSUB,
+             ISD::STRICT_FMUL,
+             ISD::STRICT_FDIV,
+             ISD::STRICT_FMA,
+             ISD::STRICT_FCEIL,
+             ISD::STRICT_FFLOOR,
+             ISD::STRICT_FSQRT,
+             ISD::STRICT_FRINT,
+             ISD::STRICT_FNEARBYINT,
+             ISD::STRICT_FROUND,
+             ISD::STRICT_FTRUNC,
+             ISD::STRICT_FROUNDEVEN,
+             ISD::STRICT_FMINNUM,
+             ISD::STRICT_FMAXNUM,
+             ISD::STRICT_FMINIMUM,
+             ISD::STRICT_FMAXIMUM,
+         })
       setOperationAction(Op, ScalarVT, Promote);
 
     for (auto Op : {ISD::FNEG, ISD::FABS})
@@ -752,45 +754,45 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 
     // promote v4f16 to v4f32 when that is known to be safe.
     auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
-    setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
-    setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
-    setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
-    setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
-
-    setOperationAction(ISD::FABS, V4Narrow, Legal);
-    setOperationAction(ISD::FNEG, V4Narrow, Legal);
-    setOperationAction(ISD::FROUND,      V4Narrow, Expand);
-    setOperationAction(ISD::FROUNDEVEN,  V4Narrow, Expand);
+    setOperationPromotedToType(ISD::FADD,       V4Narrow, MVT::v4f32);
+    setOperationPromotedToType(ISD::FSUB,       V4Narrow, MVT::v4f32);
+    setOperationPromotedToType(ISD::FMUL,       V4Narrow, MVT::v4f32);
+    setOperationPromotedToType(ISD::FDIV,       V4Narrow, MVT::v4f32);
+    setOperationPromotedToType(ISD::FCEIL,      V4Narrow, MVT::v4f32);
+    setOperationPromotedToType(ISD::FFLOOR,     V4Narrow, MVT::v4f32);
+    setOperationPromotedToType(ISD::FROUND,     V4Narrow, MVT::v4f32);
+    setOperationPromotedToType(ISD::FTRUNC,     V4Narrow, MVT::v4f32);
+    setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
+    setOperationPromotedToType(ISD::FRINT,      V4Narrow, MVT::v4f32);
+    setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
+
+    setOperationAction(ISD::FABS,        V4Narrow, Legal);
+    setOperationAction(ISD::FNEG, 	 V4Narrow, Legal);
     setOperationAction(ISD::FMA,         V4Narrow, Expand);
     setOperationAction(ISD::SETCC,       V4Narrow, Custom);
     setOperationAction(ISD::BR_CC,       V4Narrow, Expand);
     setOperationAction(ISD::SELECT,      V4Narrow, Expand);
     setOperationAction(ISD::SELECT_CC,   V4Narrow, Expand);
-    setOperationAction(ISD::FTRUNC,      V4Narrow, Expand);
-    setOperationAction(ISD::FCOPYSIGN, V4Narrow, Custom);
-    setOperationAction(ISD::FFLOOR,      V4Narrow, Expand);
-    setOperationAction(ISD::FCEIL,       V4Narrow, Expand);
-    setOperationAction(ISD::FRINT,       V4Narrow, Expand);
-    setOperationAction(ISD::FNEARBYINT,  V4Narrow, Expand);
+    setOperationAction(ISD::FCOPYSIGN,   V4Narrow, Custom);
     setOperationAction(ISD::FSQRT,       V4Narrow, Expand);
 
     auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
-    setOperationAction(ISD::FABS, V8Narrow, Legal);
-    setOperationAction(ISD::FADD,        V8Narrow, Expand);
-    setOperationAction(ISD::FCEIL,       V8Narrow, Expand);
-    setOperationAction(ISD::FCOPYSIGN, V8Narrow, Custom);
-    setOperationAction(ISD::FDIV,        V8Narrow, Expand);
-    setOperationAction(ISD::FFLOOR,      V8Narrow, Expand);
+    setOperationAction(ISD::FABS,        V8Narrow, Legal);
+    setOperationAction(ISD::FADD,        V8Narrow, Legal);
+    setOperationAction(ISD::FCEIL,       V8Narrow, Legal);
+    setOperationAction(ISD::FCOPYSIGN,   V8Narrow, Custom);
+    setOperationAction(ISD::FDIV,        V8Narrow, Legal);
+    setOperationAction(ISD::FFLOOR,      V8Narrow, Legal);
     setOperationAction(ISD::FMA,         V8Narrow, Expand);
-    setOperationAction(ISD::FMUL,        V8Narrow, Expand);
-    setOperationAction(ISD::FNEARBYINT,  V8Narrow, Expand);
-    setOperationAction(ISD::FNEG, V8Narrow, Legal);
-    setOperationAction(ISD::FROUND,      V8Narrow, Expand);
-    setOperationAction(ISD::FROUNDEVEN,  V8Narrow, Expand);
-    setOperationAction(ISD::FRINT,       V8Narrow, Expand);
+    setOperationAction(ISD::FMUL,        V8Narrow, Legal);
+    setOperationAction(ISD::FNEARBYINT,  V8Narrow, Legal);
+    setOperationAction(ISD::FNEG, 	 V8Narrow, Legal);
+    setOperationAction(ISD::FROUND,      V8Narrow, Legal);
+    setOperationAction(ISD::FROUNDEVEN,  V8Narrow, Legal);
+    setOperationAction(ISD::FRINT,       V8Narrow, Legal);
     setOperationAction(ISD::FSQRT,       V8Narrow, Expand);
-    setOperationAction(ISD::FSUB,        V8Narrow, Expand);
-    setOperationAction(ISD::FTRUNC,      V8Narrow, Expand);
+    setOperationAction(ISD::FSUB,        V8Narrow, Legal);
+    setOperationAction(ISD::FTRUNC,      V8Narrow, Legal);
     setOperationAction(ISD::SETCC,       V8Narrow, Expand);
     setOperationAction(ISD::BR_CC,       V8Narrow, Expand);
     setOperationAction(ISD::SELECT,      V8Narrow, Expand);
@@ -10593,13 +10595,19 @@ static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
         VT == MVT::v4f32)) ||
       (ST->hasSVE() &&
        (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
-    if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified)
+    if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
       // For the reciprocal estimates, convergence is quadratic, so the number
       // of digits is doubled after each iteration.  In ARMv8, the accuracy of
       // the initial estimate is 2^-8.  Thus the number of extra steps to refine
       // the result for float (23 mantissa bits) is 2 and for double (52
       // mantissa bits) is 3.
-      ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;
+      constexpr unsigned AccurateBits = 8;
+      unsigned DesiredBits =
+          APFloat::semanticsPrecision(DAG.EVTToAPFloatSemantics(VT));
+      ExtraSteps = DesiredBits <= AccurateBits
+                       ? 0
+                       : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
+    }
 
     return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
   }

diff  --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 3c67f616c1b9ce..6254e68326f79d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -128,6 +128,7 @@ def HasRDM           : Predicate<"Subtarget->hasRDM()">,
                                  AssemblerPredicateWithAll<(all_of FeatureRDM), "rdm">;
 def HasFullFP16      : Predicate<"Subtarget->hasFullFP16()">,
                                  AssemblerPredicateWithAll<(all_of FeatureFullFP16), "fullfp16">;
+def HasNoFullFP16    : Predicate<"!Subtarget->hasFullFP16()">;
 def HasFP16FML       : Predicate<"Subtarget->hasFP16FML()">,
                                  AssemblerPredicateWithAll<(all_of FeatureFP16FML), "fp16fml">;
 def HasSPE           : Predicate<"Subtarget->hasSPE()">,
@@ -254,6 +255,7 @@ def HasTRBE          : Predicate<"Subtarget->hasTRBE()">,
                        AssemblerPredicateWithAll<(all_of FeatureTRBE), "trbe">;
 def HasBF16          : Predicate<"Subtarget->hasBF16()">,
                        AssemblerPredicateWithAll<(all_of FeatureBF16), "bf16">;
+def HasNoBF16        : Predicate<"!Subtarget->hasBF16()">;
 def HasMatMulInt8    : Predicate<"Subtarget->hasMatMulInt8()">,
                        AssemblerPredicateWithAll<(all_of FeatureMatMulInt8), "i8mm">;
 def HasMatMulFP32    : Predicate<"Subtarget->hasMatMulFP32()">,
@@ -764,6 +766,8 @@ def AArch64fcvtxnv: PatFrags<(ops node:$Rn),
                                  [(int_aarch64_neon_fcvtxn node:$Rn),
                                   (AArch64fcvtxn_n node:$Rn)]>;
 
+//def Aarch64softf32tobf16v8: SDNode<"AArch64ISD::", SDTFPRoundOp>;
+
 def AArch64bici: SDNode<"AArch64ISD::BICi", SDT_AArch64vecimm>;
 def AArch64orri: SDNode<"AArch64ISD::ORRi", SDT_AArch64vecimm>;
 
@@ -9739,6 +9743,93 @@ let Predicates = [HasCPA] in {
   def MSUBPT : MulAccumCPA<1, "msubpt">;
 }
 
+def round_v4fp32_to_v4bf16 :
+  OutPatFrag<(ops node:$Rn),
+             // !NaN ? Round : Quiet(NaN)
+             (BSPv16i8 (FCMEQv4f32 $Rn, $Rn),
+                       (ADDv4i32
+                         (ADDv4i32 $Rn,
+                           // Extract the LSB of the fp32 *truncated* to bf16.
+                           (ANDv16i8 (USHRv4i32_shift V128:$Rn, (i32 16)),
+                                     (MOVIv4i32 (i32 1), (i32 0)))),
+                         // Bias which will help us break ties correctly.
+                         (MOVIv4s_msl (i32 127), (i32 264))),
+                       // Set the quiet bit in the NaN.
+                       (ORRv4i32 $Rn, (i32 64), (i32 16)))>;
+
+multiclass PromoteUnaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst> {
+  let Predicates = [HasNoFullFP16] in
+  def : Pat<(InOp (v8f16 V128:$Rn)),
+            (v8f16 (FCVTNv8i16
+              (INSERT_SUBREG (IMPLICIT_DEF),
+                             (v4f16 (FCVTNv4i16
+                               (v4f32 (OutInst
+                                 (v4f32 (FCVTLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))),
+               dsub),
+              (v4f32 (OutInst (v4f32 (FCVTLv8i16 V128:$Rn))))))>;
+
+  let Predicates = [HasBF16] in
+  def : Pat<(InOp (v8bf16 V128:$Rn)),
+            (v8bf16 (BFCVTN2
+              (v8bf16 (BFCVTN
+                (v4f32 (OutInst
+                  (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))),
+              (v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn))))))>;
+
+  let Predicates = [HasNoBF16] in
+  def : Pat<(InOp (v8bf16 V128:$Rn)),
+            (UZP2v8i16
+              (round_v4fp32_to_v4bf16 (v4f32 (OutInst
+                  (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub))))))),
+              (round_v4fp32_to_v4bf16 (v4f32 (OutInst
+                  (v4f32 (SHLLv8i16 V128:$Rn))))))>;
+}
+defm : PromoteUnaryv8f16Tov4f32<any_fceil,  	FRINTPv4f32>;
+defm : PromoteUnaryv8f16Tov4f32<any_ffloor, 	FRINTMv4f32>;
+defm : PromoteUnaryv8f16Tov4f32<any_fnearbyint, FRINTIv4f32>;
+defm : PromoteUnaryv8f16Tov4f32<any_fround, 	FRINTAv4f32>;
+defm : PromoteUnaryv8f16Tov4f32<any_froundeven, FRINTNv4f32>;
+defm : PromoteUnaryv8f16Tov4f32<any_frint,  	FRINTXv4f32>;
+defm : PromoteUnaryv8f16Tov4f32<any_ftrunc, 	FRINTZv4f32>;
+
+multiclass PromoteBinaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst> {
+  let Predicates = [HasNoFullFP16] in
+  def : Pat<(InOp (v8f16 V128:$Rn), (v8f16 V128:$Rm)),
+            (v8f16 (FCVTNv8i16
+              (INSERT_SUBREG (IMPLICIT_DEF),
+                             (v4f16 (FCVTNv4i16
+                               (v4f32 (OutInst
+                                 (v4f32 (FCVTLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))),
+                                 (v4f32 (FCVTLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))),
+               dsub),
+              (v4f32 (OutInst (v4f32 (FCVTLv8i16 V128:$Rn)),
+                              (v4f32 (FCVTLv8i16 V128:$Rm))))))>;
+
+  let Predicates = [HasBF16] in
+  def : Pat<(InOp (v8bf16 V128:$Rn), (v8bf16 V128:$Rm)),
+            (v8bf16 (BFCVTN2
+              (v8bf16 (BFCVTN
+                (v4f32 (OutInst
+                  (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))),
+                  (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))),
+              (v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn)),
+                              (v4f32 (SHLLv8i16 V128:$Rm))))))>;
+
+  let Predicates = [HasNoBF16] in
+  def : Pat<(InOp (v8bf16 V128:$Rn), (v8bf16 V128:$Rm)),
+            (UZP2v8i16
+              (round_v4fp32_to_v4bf16 (v4f32 (OutInst
+                  (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))),
+                  (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub))))))),
+              (round_v4fp32_to_v4bf16 (v4f32 (OutInst
+                  (v4f32 (SHLLv8i16 V128:$Rn)),
+                  (v4f32 (SHLLv8i16 V128:$Rm))))))>;
+}
+defm : PromoteBinaryv8f16Tov4f32<any_fadd, FADDv4f32>;
+defm : PromoteBinaryv8f16Tov4f32<any_fdiv, FDIVv4f32>;
+defm : PromoteBinaryv8f16Tov4f32<any_fmul, FMULv4f32>;
+defm : PromoteBinaryv8f16Tov4f32<any_fsub, FSUBv4f32>;
+
 include "AArch64InstrAtomics.td"
 include "AArch64SVEInstrInfo.td"
 include "AArch64SMEInstrInfo.td"

diff  --git a/llvm/test/CodeGen/AArch64/faddp-half.ll b/llvm/test/CodeGen/AArch64/faddp-half.ll
index 6068a4742eea99..447476e76ec64a 100644
--- a/llvm/test/CodeGen/AArch64/faddp-half.ll
+++ b/llvm/test/CodeGen/AArch64/faddp-half.ll
@@ -107,10 +107,15 @@ define half @faddp_8xhalf(<8 x half> %a) {
 ; CHECKNOFP16-LABEL: faddp_8xhalf:
 ; CHECKNOFP16:       // %bb.0: // %entry
 ; CHECKNOFP16-NEXT:    dup v1.8h, v0.h[1]
-; CHECKNOFP16-NEXT:    fcvt s0, h0
-; CHECKNOFP16-NEXT:    fcvt s1, h1
-; CHECKNOFP16-NEXT:    fadd s0, s0, s1
-; CHECKNOFP16-NEXT:    fcvt h0, s0
+; CHECKNOFP16-NEXT:    fcvtl v2.4s, v0.4h
+; CHECKNOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECKNOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECKNOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECKNOFP16-NEXT:    fadd v2.4s, v2.4s, v3.4s
+; CHECKNOFP16-NEXT:    fadd v1.4s, v0.4s, v1.4s
+; CHECKNOFP16-NEXT:    fcvtn v0.4h, v2.4s
+; CHECKNOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
+; CHECKNOFP16-NEXT:    // kill: def $h0 killed $h0 killed $q0
 ; CHECKNOFP16-NEXT:    ret
 entry:
   %shift = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -128,10 +133,15 @@ define half @faddp_8xhalf_commute(<8 x half> %a) {
 ; CHECKNOFP16-LABEL: faddp_8xhalf_commute:
 ; CHECKNOFP16:       // %bb.0: // %entry
 ; CHECKNOFP16-NEXT:    dup v1.8h, v0.h[1]
-; CHECKNOFP16-NEXT:    fcvt s0, h0
-; CHECKNOFP16-NEXT:    fcvt s1, h1
-; CHECKNOFP16-NEXT:    fadd s0, s1, s0
-; CHECKNOFP16-NEXT:    fcvt h0, s0
+; CHECKNOFP16-NEXT:    fcvtl v2.4s, v0.4h
+; CHECKNOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECKNOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECKNOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECKNOFP16-NEXT:    fadd v2.4s, v3.4s, v2.4s
+; CHECKNOFP16-NEXT:    fadd v1.4s, v1.4s, v0.4s
+; CHECKNOFP16-NEXT:    fcvtn v0.4h, v2.4s
+; CHECKNOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
+; CHECKNOFP16-NEXT:    // kill: def $h0 killed $h0 killed $q0
 ; CHECKNOFP16-NEXT:    ret
 entry:
   %shift = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -149,61 +159,15 @@ define <8 x half> @addp_v8f16(<8 x half> %a) {
 ;
 ; CHECKNOFP16-LABEL: addp_v8f16:
 ; CHECKNOFP16:       // %bb.0: // %entry
-; CHECKNOFP16-NEXT:    rev32 v2.8h, v0.8h
-; CHECKNOFP16-NEXT:    mov h1, v0.h[1]
-; CHECKNOFP16-NEXT:    fcvt s4, h0
-; CHECKNOFP16-NEXT:    mov h5, v0.h[2]
-; CHECKNOFP16-NEXT:    mov h16, v0.h[3]
-; CHECKNOFP16-NEXT:    mov h3, v2.h[1]
-; CHECKNOFP16-NEXT:    fcvt s6, h2
-; CHECKNOFP16-NEXT:    fcvt s1, h1
-; CHECKNOFP16-NEXT:    mov h7, v2.h[2]
-; CHECKNOFP16-NEXT:    fcvt s5, h5
-; CHECKNOFP16-NEXT:    fcvt s16, h16
-; CHECKNOFP16-NEXT:    fcvt s3, h3
-; CHECKNOFP16-NEXT:    fadd s4, s6, s4
-; CHECKNOFP16-NEXT:    mov h6, v2.h[3]
-; CHECKNOFP16-NEXT:    fcvt s7, h7
-; CHECKNOFP16-NEXT:    fadd s3, s3, s1
-; CHECKNOFP16-NEXT:    fcvt s6, h6
-; CHECKNOFP16-NEXT:    fcvt h1, s4
-; CHECKNOFP16-NEXT:    fadd s4, s7, s5
-; CHECKNOFP16-NEXT:    mov h5, v0.h[4]
-; CHECKNOFP16-NEXT:    mov h7, v2.h[4]
-; CHECKNOFP16-NEXT:    fcvt h3, s3
-; CHECKNOFP16-NEXT:    fadd s6, s6, s16
-; CHECKNOFP16-NEXT:    mov h16, v2.h[5]
-; CHECKNOFP16-NEXT:    fcvt h4, s4
-; CHECKNOFP16-NEXT:    mov v1.h[1], v3.h[0]
-; CHECKNOFP16-NEXT:    fcvt s3, h5
-; CHECKNOFP16-NEXT:    fcvt s5, h7
-; CHECKNOFP16-NEXT:    mov h7, v0.h[5]
-; CHECKNOFP16-NEXT:    fcvt h6, s6
-; CHECKNOFP16-NEXT:    fcvt s16, h16
-; CHECKNOFP16-NEXT:    mov v1.h[2], v4.h[0]
-; CHECKNOFP16-NEXT:    mov h4, v0.h[6]
-; CHECKNOFP16-NEXT:    fadd s3, s5, s3
-; CHECKNOFP16-NEXT:    mov h5, v2.h[6]
-; CHECKNOFP16-NEXT:    fcvt s7, h7
-; CHECKNOFP16-NEXT:    mov h0, v0.h[7]
-; CHECKNOFP16-NEXT:    mov h2, v2.h[7]
-; CHECKNOFP16-NEXT:    mov v1.h[3], v6.h[0]
-; CHECKNOFP16-NEXT:    fcvt s4, h4
-; CHECKNOFP16-NEXT:    fcvt h3, s3
-; CHECKNOFP16-NEXT:    fcvt s5, h5
-; CHECKNOFP16-NEXT:    fadd s6, s16, s7
-; CHECKNOFP16-NEXT:    fcvt s0, h0
-; CHECKNOFP16-NEXT:    fcvt s2, h2
-; CHECKNOFP16-NEXT:    mov v1.h[4], v3.h[0]
-; CHECKNOFP16-NEXT:    fadd s4, s5, s4
-; CHECKNOFP16-NEXT:    fcvt h3, s6
-; CHECKNOFP16-NEXT:    fadd s0, s2, s0
-; CHECKNOFP16-NEXT:    mov v1.h[5], v3.h[0]
-; CHECKNOFP16-NEXT:    fcvt h3, s4
-; CHECKNOFP16-NEXT:    fcvt h0, s0
-; CHECKNOFP16-NEXT:    mov v1.h[6], v3.h[0]
-; CHECKNOFP16-NEXT:    mov v1.h[7], v0.h[0]
-; CHECKNOFP16-NEXT:    mov v0.16b, v1.16b
+; CHECKNOFP16-NEXT:    rev32 v1.8h, v0.8h
+; CHECKNOFP16-NEXT:    fcvtl v2.4s, v0.4h
+; CHECKNOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECKNOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECKNOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECKNOFP16-NEXT:    fadd v2.4s, v3.4s, v2.4s
+; CHECKNOFP16-NEXT:    fadd v1.4s, v1.4s, v0.4s
+; CHECKNOFP16-NEXT:    fcvtn v0.4h, v2.4s
+; CHECKNOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECKNOFP16-NEXT:    ret
 entry:
   %s = shufflevector <8 x half> %a, <8 x half> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
@@ -221,116 +185,24 @@ define <16 x half> @addp_v16f16(<16 x half> %a) {
 ;
 ; CHECKNOFP16-LABEL: addp_v16f16:
 ; CHECKNOFP16:       // %bb.0: // %entry
-; CHECKNOFP16-NEXT:    rev32 v5.8h, v0.8h
-; CHECKNOFP16-NEXT:    rev32 v4.8h, v1.8h
-; CHECKNOFP16-NEXT:    mov h3, v0.h[1]
-; CHECKNOFP16-NEXT:    mov h6, v1.h[1]
-; CHECKNOFP16-NEXT:    fcvt s16, h0
-; CHECKNOFP16-NEXT:    mov h17, v0.h[2]
-; CHECKNOFP16-NEXT:    fcvt s20, h1
-; CHECKNOFP16-NEXT:    mov h21, v1.h[2]
-; CHECKNOFP16-NEXT:    mov h2, v5.h[1]
-; CHECKNOFP16-NEXT:    mov h7, v4.h[1]
-; CHECKNOFP16-NEXT:    fcvt s3, h3
-; CHECKNOFP16-NEXT:    fcvt s18, h5
-; CHECKNOFP16-NEXT:    mov h19, v5.h[2]
-; CHECKNOFP16-NEXT:    fcvt s6, h6
-; CHECKNOFP16-NEXT:    fcvt s22, h4
-; CHECKNOFP16-NEXT:    mov h23, v4.h[2]
-; CHECKNOFP16-NEXT:    fcvt s17, h17
-; CHECKNOFP16-NEXT:    mov h24, v5.h[3]
-; CHECKNOFP16-NEXT:    fcvt s21, h21
-; CHECKNOFP16-NEXT:    mov h25, v4.h[6]
-; CHECKNOFP16-NEXT:    fcvt s2, h2
-; CHECKNOFP16-NEXT:    fcvt s7, h7
-; CHECKNOFP16-NEXT:    fadd s16, s18, s16
-; CHECKNOFP16-NEXT:    fcvt s18, h19
-; CHECKNOFP16-NEXT:    mov h19, v0.h[3]
-; CHECKNOFP16-NEXT:    fadd s20, s22, s20
-; CHECKNOFP16-NEXT:    fcvt s22, h23
-; CHECKNOFP16-NEXT:    mov h23, v4.h[3]
-; CHECKNOFP16-NEXT:    fadd s3, s2, s3
-; CHECKNOFP16-NEXT:    fadd s6, s7, s6
-; CHECKNOFP16-NEXT:    mov h7, v1.h[3]
-; CHECKNOFP16-NEXT:    fcvt h2, s16
-; CHECKNOFP16-NEXT:    fadd s16, s18, s17
-; CHECKNOFP16-NEXT:    fcvt s18, h19
-; CHECKNOFP16-NEXT:    fcvt s19, h24
-; CHECKNOFP16-NEXT:    mov h24, v5.h[6]
-; CHECKNOFP16-NEXT:    fcvt h17, s3
-; CHECKNOFP16-NEXT:    fcvt h3, s20
-; CHECKNOFP16-NEXT:    fadd s20, s22, s21
-; CHECKNOFP16-NEXT:    fcvt h6, s6
-; CHECKNOFP16-NEXT:    fcvt s7, h7
-; CHECKNOFP16-NEXT:    fcvt s22, h23
-; CHECKNOFP16-NEXT:    mov h21, v0.h[4]
-; CHECKNOFP16-NEXT:    mov h23, v5.h[4]
-; CHECKNOFP16-NEXT:    fcvt h16, s16
-; CHECKNOFP16-NEXT:    fadd s18, s19, s18
-; CHECKNOFP16-NEXT:    mov h19, v4.h[4]
-; CHECKNOFP16-NEXT:    mov v2.h[1], v17.h[0]
-; CHECKNOFP16-NEXT:    mov h17, v1.h[4]
-; CHECKNOFP16-NEXT:    mov v3.h[1], v6.h[0]
-; CHECKNOFP16-NEXT:    fcvt h6, s20
-; CHECKNOFP16-NEXT:    fadd s7, s22, s7
-; CHECKNOFP16-NEXT:    fcvt s20, h21
-; CHECKNOFP16-NEXT:    mov h21, v0.h[5]
-; CHECKNOFP16-NEXT:    mov h22, v5.h[5]
-; CHECKNOFP16-NEXT:    fcvt h18, s18
-; CHECKNOFP16-NEXT:    fcvt s19, h19
-; CHECKNOFP16-NEXT:    mov h5, v5.h[7]
-; CHECKNOFP16-NEXT:    mov v2.h[2], v16.h[0]
-; CHECKNOFP16-NEXT:    fcvt s16, h23
-; CHECKNOFP16-NEXT:    fcvt s17, h17
-; CHECKNOFP16-NEXT:    mov v3.h[2], v6.h[0]
-; CHECKNOFP16-NEXT:    fcvt h6, s7
-; CHECKNOFP16-NEXT:    mov h7, v1.h[5]
-; CHECKNOFP16-NEXT:    mov h23, v4.h[5]
-; CHECKNOFP16-NEXT:    mov h4, v4.h[7]
-; CHECKNOFP16-NEXT:    fcvt s5, h5
-; CHECKNOFP16-NEXT:    fadd s16, s16, s20
-; CHECKNOFP16-NEXT:    mov h20, v0.h[6]
-; CHECKNOFP16-NEXT:    fadd s17, s19, s17
-; CHECKNOFP16-NEXT:    mov h19, v1.h[6]
-; CHECKNOFP16-NEXT:    mov v2.h[3], v18.h[0]
-; CHECKNOFP16-NEXT:    fcvt s18, h21
-; CHECKNOFP16-NEXT:    fcvt s21, h22
-; CHECKNOFP16-NEXT:    mov v3.h[3], v6.h[0]
-; CHECKNOFP16-NEXT:    fcvt s6, h7
-; CHECKNOFP16-NEXT:    fcvt s7, h23
-; CHECKNOFP16-NEXT:    fcvt s22, h24
-; CHECKNOFP16-NEXT:    fcvt s23, h25
-; CHECKNOFP16-NEXT:    fcvt h16, s16
-; CHECKNOFP16-NEXT:    fcvt s20, h20
-; CHECKNOFP16-NEXT:    fcvt h17, s17
-; CHECKNOFP16-NEXT:    fcvt s19, h19
-; CHECKNOFP16-NEXT:    mov h0, v0.h[7]
-; CHECKNOFP16-NEXT:    mov h1, v1.h[7]
-; CHECKNOFP16-NEXT:    fadd s18, s21, s18
-; CHECKNOFP16-NEXT:    fcvt s4, h4
-; CHECKNOFP16-NEXT:    fadd s6, s7, s6
-; CHECKNOFP16-NEXT:    mov v2.h[4], v16.h[0]
-; CHECKNOFP16-NEXT:    fadd s7, s22, s20
-; CHECKNOFP16-NEXT:    mov v3.h[4], v17.h[0]
-; CHECKNOFP16-NEXT:    fadd s16, s23, s19
-; CHECKNOFP16-NEXT:    fcvt s0, h0
-; CHECKNOFP16-NEXT:    fcvt s1, h1
-; CHECKNOFP16-NEXT:    fcvt h17, s18
-; CHECKNOFP16-NEXT:    fcvt h6, s6
-; CHECKNOFP16-NEXT:    fadd s0, s5, s0
-; CHECKNOFP16-NEXT:    fcvt h5, s7
-; CHECKNOFP16-NEXT:    fadd s1, s4, s1
-; CHECKNOFP16-NEXT:    mov v2.h[5], v17.h[0]
-; CHECKNOFP16-NEXT:    mov v3.h[5], v6.h[0]
-; CHECKNOFP16-NEXT:    fcvt h6, s16
-; CHECKNOFP16-NEXT:    fcvt h0, s0
-; CHECKNOFP16-NEXT:    fcvt h1, s1
-; CHECKNOFP16-NEXT:    mov v2.h[6], v5.h[0]
-; CHECKNOFP16-NEXT:    mov v3.h[6], v6.h[0]
-; CHECKNOFP16-NEXT:    mov v2.h[7], v0.h[0]
-; CHECKNOFP16-NEXT:    mov v3.h[7], v1.h[0]
-; CHECKNOFP16-NEXT:    mov v0.16b, v2.16b
-; CHECKNOFP16-NEXT:    mov v1.16b, v3.16b
+; CHECKNOFP16-NEXT:    rev32 v2.8h, v0.8h
+; CHECKNOFP16-NEXT:    rev32 v3.8h, v1.8h
+; CHECKNOFP16-NEXT:    fcvtl v4.4s, v0.4h
+; CHECKNOFP16-NEXT:    fcvtl v6.4s, v1.4h
+; CHECKNOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECKNOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECKNOFP16-NEXT:    fcvtl v5.4s, v2.4h
+; CHECKNOFP16-NEXT:    fcvtl v7.4s, v3.4h
+; CHECKNOFP16-NEXT:    fcvtl2 v2.4s, v2.8h
+; CHECKNOFP16-NEXT:    fcvtl2 v3.4s, v3.8h
+; CHECKNOFP16-NEXT:    fadd v4.4s, v5.4s, v4.4s
+; CHECKNOFP16-NEXT:    fadd v5.4s, v7.4s, v6.4s
+; CHECKNOFP16-NEXT:    fadd v2.4s, v2.4s, v0.4s
+; CHECKNOFP16-NEXT:    fadd v3.4s, v3.4s, v1.4s
+; CHECKNOFP16-NEXT:    fcvtn v0.4h, v4.4s
+; CHECKNOFP16-NEXT:    fcvtn v1.4h, v5.4s
+; CHECKNOFP16-NEXT:    fcvtn2 v0.8h, v2.4s
+; CHECKNOFP16-NEXT:    fcvtn2 v1.8h, v3.4s
 ; CHECKNOFP16-NEXT:    ret
 entry:
   %s = shufflevector <16 x half> %a, <16 x half> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>

diff  --git a/llvm/test/CodeGen/AArch64/faddsub.ll b/llvm/test/CodeGen/AArch64/faddsub.ll
index 31389f5a77d6f7..f8970dc9e8d5d0 100644
--- a/llvm/test/CodeGen/AArch64/faddsub.ll
+++ b/llvm/test/CodeGen/AArch64/faddsub.ll
@@ -169,60 +169,14 @@ entry:
 define <7 x half> @fadd_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-SD-NOFP16-LABEL: fadd_v7f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h0
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fadd s4, s5, s4
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fadd s3, s3, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    fadd s4, s7, s6
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fadd s5, s5, s16
-; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h7
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    fadd s3, s6, s3
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT:    fadd s6, s16, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fadd s4, s5, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s1
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fadd v2.4s, v3.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fadd v1.4s, v0.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: fadd_v7f16:
@@ -309,60 +263,14 @@ entry:
 define <8 x half> @fadd_v8f16(<8 x half> %a, <8 x half> %b) {
 ; CHECK-SD-NOFP16-LABEL: fadd_v8f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h0
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fadd s4, s5, s4
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fadd s3, s3, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    fadd s4, s7, s6
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fadd s5, s5, s16
-; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h7
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    fadd s3, s6, s3
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT:    fadd s6, s16, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fadd s4, s5, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s1
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fadd v2.4s, v3.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fadd v1.4s, v0.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: fadd_v8f16:
@@ -394,114 +302,22 @@ entry:
 define <16 x half> @fadd_v16f16(<16 x half> %a, <16 x half> %b) {
 ; CHECK-SD-NOFP16-LABEL: fadd_v16f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h6, v2.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h0
-; CHECK-SD-NOFP16-NEXT:    mov h16, v3.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h18, v2.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h19, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h1
-; CHECK-SD-NOFP16-NEXT:    mov h22, v3.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h23, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    mov h24, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fadd s4, s5, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h17
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h19
-; CHECK-SD-NOFP16-NEXT:    mov h19, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    fadd s20, s21, s20
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h22
-; CHECK-SD-NOFP16-NEXT:    mov h22, v3.h[3]
-; CHECK-SD-NOFP16-NEXT:    fadd s6, s7, s6
-; CHECK-SD-NOFP16-NEXT:    mov h7, v2.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h25, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fadd s5, s16, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h23
-; CHECK-SD-NOFP16-NEXT:    mov h23, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fadd s17, s18, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h19
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt h19, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s20
-; CHECK-SD-NOFP16-NEXT:    fadd s16, s16, s21
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h23
-; CHECK-SD-NOFP16-NEXT:    fcvt h17, s17
-; CHECK-SD-NOFP16-NEXT:    mov h21, v2.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h23, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[1], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h22
-; CHECK-SD-NOFP16-NEXT:    mov h22, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fadd s7, s18, s7
-; CHECK-SD-NOFP16-NEXT:    mov h18, v3.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov v5.h[1], v19.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT:    fadd s6, s20, s6
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[2], v17.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h21
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h22
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h23
-; CHECK-SD-NOFP16-NEXT:    mov h21, v2.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h22, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov v5.h[2], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v3.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h23, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    fadd s17, s19, s17
-; CHECK-SD-NOFP16-NEXT:    mov h19, v2.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[3], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    fadd s18, s20, s18
-; CHECK-SD-NOFP16-NEXT:    mov h20, v3.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h21
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h22
-; CHECK-SD-NOFP16-NEXT:    fcvt s22, h24
-; CHECK-SD-NOFP16-NEXT:    mov h2, v2.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v5.h[3], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h23
-; CHECK-SD-NOFP16-NEXT:    fcvt h17, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-SD-NOFP16-NEXT:    fcvt s23, h25
-; CHECK-SD-NOFP16-NEXT:    fcvt h18, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h20
-; CHECK-SD-NOFP16-NEXT:    mov h3, v3.h[7]
-; CHECK-SD-NOFP16-NEXT:    fadd s7, s21, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fadd s6, s16, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[4], v17.h[0]
-; CHECK-SD-NOFP16-NEXT:    fadd s16, s22, s19
-; CHECK-SD-NOFP16-NEXT:    mov v5.h[4], v18.h[0]
-; CHECK-SD-NOFP16-NEXT:    fadd s17, s23, s20
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s7
-; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s16
-; CHECK-SD-NOFP16-NEXT:    fadd s1, s1, s3
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[5], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v5.h[5], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[6], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v5.h[6], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v5.h[7], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v4.16b
-; CHECK-SD-NOFP16-NEXT:    mov v1.16b, v5.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v4.4s, v2.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v5.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v6.4s, v3.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v7.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v2.4s, v2.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v3.4s, v3.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fadd v4.4s, v5.4s, v4.4s
+; CHECK-SD-NOFP16-NEXT:    fadd v5.4s, v7.4s, v6.4s
+; CHECK-SD-NOFP16-NEXT:    fadd v2.4s, v0.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fadd v3.4s, v1.4s, v3.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v4.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v5.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v3.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: fadd_v16f16:
@@ -705,60 +521,14 @@ entry:
 define <7 x half> @fsub_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-SD-NOFP16-LABEL: fsub_v7f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h0
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fsub s4, s5, s4
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fsub s3, s3, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    fsub s4, s7, s6
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fsub s5, s5, s16
-; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h7
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    fsub s3, s6, s3
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT:    fsub s6, s16, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fsub s4, s5, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT:    fsub s0, s0, s1
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fsub v2.4s, v3.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fsub v1.4s, v0.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: fsub_v7f16:
@@ -845,60 +615,14 @@ entry:
 define <8 x half> @fsub_v8f16(<8 x half> %a, <8 x half> %b) {
 ; CHECK-SD-NOFP16-LABEL: fsub_v8f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h0
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fsub s4, s5, s4
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fsub s3, s3, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    fsub s4, s7, s6
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fsub s5, s5, s16
-; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h7
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    fsub s3, s6, s3
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT:    fsub s6, s16, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fsub s4, s5, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT:    fsub s0, s0, s1
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fsub v2.4s, v3.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fsub v1.4s, v0.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: fsub_v8f16:
@@ -930,114 +654,22 @@ entry:
 define <16 x half> @fsub_v16f16(<16 x half> %a, <16 x half> %b) {
 ; CHECK-SD-NOFP16-LABEL: fsub_v16f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h6, v2.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h0
-; CHECK-SD-NOFP16-NEXT:    mov h16, v3.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h18, v2.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h19, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h1
-; CHECK-SD-NOFP16-NEXT:    mov h22, v3.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h23, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    mov h24, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fsub s4, s5, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h17
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h19
-; CHECK-SD-NOFP16-NEXT:    mov h19, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    fsub s20, s21, s20
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h22
-; CHECK-SD-NOFP16-NEXT:    mov h22, v3.h[3]
-; CHECK-SD-NOFP16-NEXT:    fsub s6, s7, s6
-; CHECK-SD-NOFP16-NEXT:    mov h7, v2.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h25, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fsub s5, s16, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h23
-; CHECK-SD-NOFP16-NEXT:    mov h23, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fsub s17, s18, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h19
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt h19, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s20
-; CHECK-SD-NOFP16-NEXT:    fsub s16, s16, s21
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h23
-; CHECK-SD-NOFP16-NEXT:    fcvt h17, s17
-; CHECK-SD-NOFP16-NEXT:    mov h21, v2.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h23, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[1], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h22
-; CHECK-SD-NOFP16-NEXT:    mov h22, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fsub s7, s18, s7
-; CHECK-SD-NOFP16-NEXT:    mov h18, v3.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov v5.h[1], v19.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT:    fsub s6, s20, s6
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[2], v17.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h21
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h22
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h23
-; CHECK-SD-NOFP16-NEXT:    mov h21, v2.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h22, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov v5.h[2], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v3.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h23, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    fsub s17, s19, s17
-; CHECK-SD-NOFP16-NEXT:    mov h19, v2.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[3], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    fsub s18, s20, s18
-; CHECK-SD-NOFP16-NEXT:    mov h20, v3.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h21
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h22
-; CHECK-SD-NOFP16-NEXT:    fcvt s22, h24
-; CHECK-SD-NOFP16-NEXT:    mov h2, v2.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v5.h[3], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h23
-; CHECK-SD-NOFP16-NEXT:    fcvt h17, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-SD-NOFP16-NEXT:    fcvt s23, h25
-; CHECK-SD-NOFP16-NEXT:    fcvt h18, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h20
-; CHECK-SD-NOFP16-NEXT:    mov h3, v3.h[7]
-; CHECK-SD-NOFP16-NEXT:    fsub s7, s21, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fsub s6, s16, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[4], v17.h[0]
-; CHECK-SD-NOFP16-NEXT:    fsub s16, s22, s19
-; CHECK-SD-NOFP16-NEXT:    mov v5.h[4], v18.h[0]
-; CHECK-SD-NOFP16-NEXT:    fsub s17, s23, s20
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s7
-; CHECK-SD-NOFP16-NEXT:    fsub s0, s0, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s16
-; CHECK-SD-NOFP16-NEXT:    fsub s1, s1, s3
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[5], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v5.h[5], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[6], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v5.h[6], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v5.h[7], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v4.16b
-; CHECK-SD-NOFP16-NEXT:    mov v1.16b, v5.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v4.4s, v2.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v5.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v6.4s, v3.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v7.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v2.4s, v2.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v3.4s, v3.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fsub v4.4s, v5.4s, v4.4s
+; CHECK-SD-NOFP16-NEXT:    fsub v5.4s, v7.4s, v6.4s
+; CHECK-SD-NOFP16-NEXT:    fsub v2.4s, v0.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fsub v3.4s, v1.4s, v3.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v4.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v5.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v3.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: fsub_v16f16:

diff  --git a/llvm/test/CodeGen/AArch64/fcvt.ll b/llvm/test/CodeGen/AArch64/fcvt.ll
index 584174d6d064ff..3b8a22a052b836 100644
--- a/llvm/test/CodeGen/AArch64/fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt.ll
@@ -148,45 +148,12 @@ entry:
 define <7 x half> @ceil_v7f16(<7 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: ceil_v7f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frintp s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frintp s5, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    frintp s2, s3
-; CHECK-SD-NOFP16-NEXT:    frintp s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    frintp s5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frintp s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frintp s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    frintp s0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v1.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v2.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    frintp v1.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-SD-NOFP16-NEXT:    frintp v1.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: ceil_v7f16:
@@ -234,26 +201,9 @@ entry:
 define <4 x half> @ceil_v4f16(<4 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: ceil_v4f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frintp s0, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h4
-; CHECK-SD-NOFP16-NEXT:    frintp s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    frintp s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    frintp s2, s3
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[3], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    frintp v0.4s, v0.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: ceil_v4f16:
@@ -280,45 +230,12 @@ entry:
 define <8 x half> @ceil_v8f16(<8 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: ceil_v8f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frintp s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frintp s5, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    frintp s2, s3
-; CHECK-SD-NOFP16-NEXT:    frintp s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    frintp s5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frintp s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frintp s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    frintp s0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v1.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v2.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    frintp v1.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-SD-NOFP16-NEXT:    frintp v1.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: ceil_v8f16:
@@ -348,84 +265,18 @@ entry:
 define <16 x half> @ceil_v16f16(<16 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: ceil_v16f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h0
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h1
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h20, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h21, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    frintp s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    frintp s6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    frintp s18, s2
-; CHECK-SD-NOFP16-NEXT:    frintp s19, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    frintp s4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT:    frintp s6, s7
-; CHECK-SD-NOFP16-NEXT:    frintp s16, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h20
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h21
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[1], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    frintp s5, s17
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    frintp s18, s18
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frintp s4, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    mov h19, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    frintp s7, s7
-; CHECK-SD-NOFP16-NEXT:    frintp s16, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h19
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frintp s4, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s16
-; CHECK-SD-NOFP16-NEXT:    frintp s7, s17
-; CHECK-SD-NOFP16-NEXT:    frintp s0, s0
-; CHECK-SD-NOFP16-NEXT:    frintp s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[7], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v2.16b
-; CHECK-SD-NOFP16-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v4.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v5.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    frintp v2.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    frintp v1.4s, v3.4s
+; CHECK-SD-NOFP16-NEXT:    frintp v3.4s, v5.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    frintp v2.4s, v4.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v3.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: ceil_v16f16:
@@ -604,45 +455,12 @@ entry:
 define <7 x half> @floor_v7f16(<7 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: floor_v7f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frintm s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frintm s5, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    frintm s2, s3
-; CHECK-SD-NOFP16-NEXT:    frintm s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    frintm s5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frintm s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frintm s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    frintm s0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v1.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v2.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    frintm v1.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-SD-NOFP16-NEXT:    frintm v1.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: floor_v7f16:
@@ -690,26 +508,9 @@ entry:
 define <4 x half> @floor_v4f16(<4 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: floor_v4f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frintm s0, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h4
-; CHECK-SD-NOFP16-NEXT:    frintm s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    frintm s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    frintm s2, s3
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[3], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    frintm v0.4s, v0.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: floor_v4f16:
@@ -736,45 +537,12 @@ entry:
 define <8 x half> @floor_v8f16(<8 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: floor_v8f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frintm s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frintm s5, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    frintm s2, s3
-; CHECK-SD-NOFP16-NEXT:    frintm s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    frintm s5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frintm s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frintm s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    frintm s0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v1.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v2.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    frintm v1.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-SD-NOFP16-NEXT:    frintm v1.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: floor_v8f16:
@@ -804,84 +572,18 @@ entry:
 define <16 x half> @floor_v16f16(<16 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: floor_v16f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h0
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h1
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h20, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h21, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    frintm s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    frintm s6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    frintm s18, s2
-; CHECK-SD-NOFP16-NEXT:    frintm s19, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    frintm s4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT:    frintm s6, s7
-; CHECK-SD-NOFP16-NEXT:    frintm s16, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h20
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h21
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[1], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    frintm s5, s17
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    frintm s18, s18
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frintm s4, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    mov h19, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    frintm s7, s7
-; CHECK-SD-NOFP16-NEXT:    frintm s16, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h19
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frintm s4, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s16
-; CHECK-SD-NOFP16-NEXT:    frintm s7, s17
-; CHECK-SD-NOFP16-NEXT:    frintm s0, s0
-; CHECK-SD-NOFP16-NEXT:    frintm s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[7], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v2.16b
-; CHECK-SD-NOFP16-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v4.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v5.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    frintm v2.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    frintm v1.4s, v3.4s
+; CHECK-SD-NOFP16-NEXT:    frintm v3.4s, v5.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    frintm v2.4s, v4.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v3.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: floor_v16f16:
@@ -1060,45 +762,12 @@ entry:
 define <7 x half> @nearbyint_v7f16(<7 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: nearbyint_v7f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frinti s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frinti s5, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    frinti s2, s3
-; CHECK-SD-NOFP16-NEXT:    frinti s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    frinti s5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frinti s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frinti s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    frinti s0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v1.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v2.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    frinti v1.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-SD-NOFP16-NEXT:    frinti v1.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: nearbyint_v7f16:
@@ -1146,26 +815,9 @@ entry:
 define <4 x half> @nearbyint_v4f16(<4 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: nearbyint_v4f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frinti s0, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h4
-; CHECK-SD-NOFP16-NEXT:    frinti s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    frinti s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    frinti s2, s3
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[3], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    frinti v0.4s, v0.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: nearbyint_v4f16:
@@ -1192,45 +844,12 @@ entry:
 define <8 x half> @nearbyint_v8f16(<8 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: nearbyint_v8f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frinti s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frinti s5, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    frinti s2, s3
-; CHECK-SD-NOFP16-NEXT:    frinti s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    frinti s5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frinti s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frinti s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    frinti s0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v1.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v2.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    frinti v1.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-SD-NOFP16-NEXT:    frinti v1.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: nearbyint_v8f16:
@@ -1260,84 +879,18 @@ entry:
 define <16 x half> @nearbyint_v16f16(<16 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: nearbyint_v16f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h0
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h1
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h20, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h21, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    frinti s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    frinti s6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    frinti s18, s2
-; CHECK-SD-NOFP16-NEXT:    frinti s19, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    frinti s4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT:    frinti s6, s7
-; CHECK-SD-NOFP16-NEXT:    frinti s16, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h20
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h21
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[1], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    frinti s5, s17
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    frinti s18, s18
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frinti s4, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    mov h19, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    frinti s7, s7
-; CHECK-SD-NOFP16-NEXT:    frinti s16, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h19
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frinti s4, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s16
-; CHECK-SD-NOFP16-NEXT:    frinti s7, s17
-; CHECK-SD-NOFP16-NEXT:    frinti s0, s0
-; CHECK-SD-NOFP16-NEXT:    frinti s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[7], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v2.16b
-; CHECK-SD-NOFP16-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v4.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v5.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    frinti v2.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    frinti v1.4s, v3.4s
+; CHECK-SD-NOFP16-NEXT:    frinti v3.4s, v5.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    frinti v2.4s, v4.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v3.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: nearbyint_v16f16:
@@ -1516,45 +1069,12 @@ entry:
 define <7 x half> @roundeven_v7f16(<7 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: roundeven_v7f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frintn s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frintn s5, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    frintn s2, s3
-; CHECK-SD-NOFP16-NEXT:    frintn s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    frintn s5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frintn s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frintn s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    frintn s0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v1.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v2.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    frintn v1.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-SD-NOFP16-NEXT:    frintn v1.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: roundeven_v7f16:
@@ -1602,26 +1122,9 @@ entry:
 define <4 x half> @roundeven_v4f16(<4 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: roundeven_v4f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frintn s0, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h4
-; CHECK-SD-NOFP16-NEXT:    frintn s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    frintn s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    frintn s2, s3
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[3], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    frintn v0.4s, v0.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: roundeven_v4f16:
@@ -1648,45 +1151,12 @@ entry:
 define <8 x half> @roundeven_v8f16(<8 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: roundeven_v8f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frintn s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frintn s5, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    frintn s2, s3
-; CHECK-SD-NOFP16-NEXT:    frintn s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    frintn s5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frintn s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frintn s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    frintn s0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v1.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v2.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    frintn v1.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-SD-NOFP16-NEXT:    frintn v1.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: roundeven_v8f16:
@@ -1716,84 +1186,18 @@ entry:
 define <16 x half> @roundeven_v16f16(<16 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: roundeven_v16f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h0
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h1
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h20, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h21, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    frintn s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    frintn s6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    frintn s18, s2
-; CHECK-SD-NOFP16-NEXT:    frintn s19, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    frintn s4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT:    frintn s6, s7
-; CHECK-SD-NOFP16-NEXT:    frintn s16, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h20
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h21
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[1], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    frintn s5, s17
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    frintn s18, s18
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frintn s4, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    mov h19, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    frintn s7, s7
-; CHECK-SD-NOFP16-NEXT:    frintn s16, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h19
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frintn s4, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s16
-; CHECK-SD-NOFP16-NEXT:    frintn s7, s17
-; CHECK-SD-NOFP16-NEXT:    frintn s0, s0
-; CHECK-SD-NOFP16-NEXT:    frintn s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[7], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v2.16b
-; CHECK-SD-NOFP16-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v4.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v5.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    frintn v2.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    frintn v1.4s, v3.4s
+; CHECK-SD-NOFP16-NEXT:    frintn v3.4s, v5.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    frintn v2.4s, v4.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v3.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: roundeven_v16f16:
@@ -1972,45 +1376,12 @@ entry:
 define <7 x half> @rint_v7f16(<7 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: rint_v7f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frintx s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frintx s5, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    frintx s2, s3
-; CHECK-SD-NOFP16-NEXT:    frintx s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    frintx s5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frintx s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frintx s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    frintx s0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v1.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v2.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    frintx v1.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-SD-NOFP16-NEXT:    frintx v1.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: rint_v7f16:
@@ -2058,26 +1429,9 @@ entry:
 define <4 x half> @rint_v4f16(<4 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: rint_v4f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frintx s0, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h4
-; CHECK-SD-NOFP16-NEXT:    frintx s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    frintx s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    frintx s2, s3
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[3], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    frintx v0.4s, v0.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: rint_v4f16:
@@ -2104,45 +1458,12 @@ entry:
 define <8 x half> @rint_v8f16(<8 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: rint_v8f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frintx s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frintx s5, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    frintx s2, s3
-; CHECK-SD-NOFP16-NEXT:    frintx s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    frintx s5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frintx s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frintx s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    frintx s0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v1.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v2.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    frintx v1.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-SD-NOFP16-NEXT:    frintx v1.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: rint_v8f16:
@@ -2172,84 +1493,18 @@ entry:
 define <16 x half> @rint_v16f16(<16 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: rint_v16f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h0
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h1
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h20, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h21, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    frintx s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    frintx s6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    frintx s18, s2
-; CHECK-SD-NOFP16-NEXT:    frintx s19, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    frintx s4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT:    frintx s6, s7
-; CHECK-SD-NOFP16-NEXT:    frintx s16, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h20
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h21
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[1], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    frintx s5, s17
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    frintx s18, s18
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frintx s4, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    mov h19, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    frintx s7, s7
-; CHECK-SD-NOFP16-NEXT:    frintx s16, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h19
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frintx s4, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s16
-; CHECK-SD-NOFP16-NEXT:    frintx s7, s17
-; CHECK-SD-NOFP16-NEXT:    frintx s0, s0
-; CHECK-SD-NOFP16-NEXT:    frintx s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[7], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v2.16b
-; CHECK-SD-NOFP16-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v4.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v5.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    frintx v2.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    frintx v1.4s, v3.4s
+; CHECK-SD-NOFP16-NEXT:    frintx v3.4s, v5.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    frintx v2.4s, v4.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v3.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: rint_v16f16:
@@ -2428,45 +1683,12 @@ entry:
 define <7 x half> @round_v7f16(<7 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: round_v7f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frinta s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frinta s5, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    frinta s2, s3
-; CHECK-SD-NOFP16-NEXT:    frinta s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    frinta s5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frinta s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frinta s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    frinta s0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v1.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v2.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    frinta v1.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-SD-NOFP16-NEXT:    frinta v1.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: round_v7f16:
@@ -2514,26 +1736,9 @@ entry:
 define <4 x half> @round_v4f16(<4 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: round_v4f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frinta s0, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h4
-; CHECK-SD-NOFP16-NEXT:    frinta s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    frinta s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    frinta s2, s3
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[3], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    frinta v0.4s, v0.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: round_v4f16:
@@ -2560,45 +1765,12 @@ entry:
 define <8 x half> @round_v8f16(<8 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: round_v8f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frinta s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frinta s5, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    frinta s2, s3
-; CHECK-SD-NOFP16-NEXT:    frinta s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    frinta s5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frinta s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frinta s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    frinta s0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v1.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v2.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    frinta v1.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-SD-NOFP16-NEXT:    frinta v1.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: round_v8f16:
@@ -2628,84 +1800,18 @@ entry:
 define <16 x half> @round_v16f16(<16 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: round_v16f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h0
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h1
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h20, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h21, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    frinta s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    frinta s6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    frinta s18, s2
-; CHECK-SD-NOFP16-NEXT:    frinta s19, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    frinta s4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT:    frinta s6, s7
-; CHECK-SD-NOFP16-NEXT:    frinta s16, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h20
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h21
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[1], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    frinta s5, s17
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    frinta s18, s18
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frinta s4, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    mov h19, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    frinta s7, s7
-; CHECK-SD-NOFP16-NEXT:    frinta s16, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h19
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frinta s4, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s16
-; CHECK-SD-NOFP16-NEXT:    frinta s7, s17
-; CHECK-SD-NOFP16-NEXT:    frinta s0, s0
-; CHECK-SD-NOFP16-NEXT:    frinta s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[7], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v2.16b
-; CHECK-SD-NOFP16-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v4.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v5.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    frinta v2.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    frinta v1.4s, v3.4s
+; CHECK-SD-NOFP16-NEXT:    frinta v3.4s, v5.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    frinta v2.4s, v4.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v3.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: round_v16f16:
@@ -2884,45 +1990,12 @@ entry:
 define <7 x half> @trunc_v7f16(<7 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: trunc_v7f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frintz s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frintz s5, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    frintz s2, s3
-; CHECK-SD-NOFP16-NEXT:    frintz s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    frintz s5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frintz s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frintz s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    frintz s0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v1.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v2.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    frintz v1.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-SD-NOFP16-NEXT:    frintz v1.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: trunc_v7f16:
@@ -2970,26 +2043,9 @@ entry:
 define <4 x half> @trunc_v4f16(<4 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: trunc_v4f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frintz s0, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h4
-; CHECK-SD-NOFP16-NEXT:    frintz s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    frintz s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    frintz s2, s3
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[3], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    frintz v0.4s, v0.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: trunc_v4f16:
@@ -3016,45 +2072,12 @@ entry:
 define <8 x half> @trunc_v8f16(<8 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: trunc_v8f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frintz s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frintz s5, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
-; CHECK-SD-NOFP16-NEXT:    frintz s2, s3
-; CHECK-SD-NOFP16-NEXT:    frintz s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    frintz s5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frintz s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frintz s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    frintz s0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v1.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v2.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    frintz v1.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-SD-NOFP16-NEXT:    frintz v1.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: trunc_v8f16:
@@ -3084,84 +2107,18 @@ entry:
 define <16 x half> @trunc_v16f16(<16 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: trunc_v16f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h0
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h1
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h20, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h21, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    frintz s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    frintz s6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    frintz s18, s2
-; CHECK-SD-NOFP16-NEXT:    frintz s19, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    frintz s4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT:    frintz s6, s7
-; CHECK-SD-NOFP16-NEXT:    frintz s16, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h20
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h21
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[1], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    frintz s5, s17
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    frintz s18, s18
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frintz s4, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    mov h19, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    frintz s7, s7
-; CHECK-SD-NOFP16-NEXT:    frintz s16, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h19
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    frintz s4, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s16
-; CHECK-SD-NOFP16-NEXT:    frintz s7, s17
-; CHECK-SD-NOFP16-NEXT:    frintz s0, s0
-; CHECK-SD-NOFP16-NEXT:    frintz s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[7], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v2.16b
-; CHECK-SD-NOFP16-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v4.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v5.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    frintz v2.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    frintz v1.4s, v3.4s
+; CHECK-SD-NOFP16-NEXT:    frintz v3.4s, v5.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    frintz v2.4s, v4.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v3.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: trunc_v16f16:

diff  --git a/llvm/test/CodeGen/AArch64/fcvt_combine.ll b/llvm/test/CodeGen/AArch64/fcvt_combine.ll
index 37133cf0aa1df9..29170aab965665 100644
--- a/llvm/test/CodeGen/AArch64/fcvt_combine.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt_combine.ll
@@ -180,48 +180,19 @@ define <3 x i32> @test_illegal_fp_to_int(<3 x float> %in) {
 define <8 x i16> @test_v8f16(<8 x half> %in) {
 ; CHECK-NO16-LABEL: test_v8f16:
 ; CHECK-NO16:       // %bb.0:
-; CHECK-NO16-NEXT:    mov h2, v0.h[1]
-; CHECK-NO16-NEXT:    mov h3, v0.h[4]
-; CHECK-NO16-NEXT:    mov h4, v0.h[5]
-; CHECK-NO16-NEXT:    mov h5, v0.h[2]
-; CHECK-NO16-NEXT:    fcvt s6, h0
-; CHECK-NO16-NEXT:    mov h7, v0.h[6]
-; CHECK-NO16-NEXT:    fmov s1, #4.00000000
-; CHECK-NO16-NEXT:    mov h16, v0.h[3]
-; CHECK-NO16-NEXT:    mov h0, v0.h[7]
-; CHECK-NO16-NEXT:    fcvt s2, h2
-; CHECK-NO16-NEXT:    fcvt s3, h3
-; CHECK-NO16-NEXT:    fcvt s4, h4
-; CHECK-NO16-NEXT:    fmul s6, s6, s1
-; CHECK-NO16-NEXT:    fcvt s5, h5
-; CHECK-NO16-NEXT:    fcvt s7, h7
-; CHECK-NO16-NEXT:    fcvt s16, h16
-; CHECK-NO16-NEXT:    fcvt s0, h0
-; CHECK-NO16-NEXT:    fmul s2, s2, s1
-; CHECK-NO16-NEXT:    fmul s3, s3, s1
-; CHECK-NO16-NEXT:    fmul s4, s4, s1
-; CHECK-NO16-NEXT:    fmul s5, s5, s1
-; CHECK-NO16-NEXT:    fcvt h6, s6
-; CHECK-NO16-NEXT:    fmul s7, s7, s1
-; CHECK-NO16-NEXT:    fmul s16, s16, s1
-; CHECK-NO16-NEXT:    fmul s0, s0, s1
-; CHECK-NO16-NEXT:    fcvt h2, s2
-; CHECK-NO16-NEXT:    fcvt h3, s3
-; CHECK-NO16-NEXT:    fcvt h4, s4
-; CHECK-NO16-NEXT:    fcvt h5, s5
-; CHECK-NO16-NEXT:    fcvt h1, s7
-; CHECK-NO16-NEXT:    fcvt h0, s0
-; CHECK-NO16-NEXT:    mov v6.h[1], v2.h[0]
-; CHECK-NO16-NEXT:    fcvt h2, s16
-; CHECK-NO16-NEXT:    mov v3.h[1], v4.h[0]
-; CHECK-NO16-NEXT:    mov v6.h[2], v5.h[0]
-; CHECK-NO16-NEXT:    mov v3.h[2], v1.h[0]
-; CHECK-NO16-NEXT:    mov v6.h[3], v2.h[0]
-; CHECK-NO16-NEXT:    mov v3.h[3], v0.h[0]
-; CHECK-NO16-NEXT:    fcvtl v1.4s, v6.4h
-; CHECK-NO16-NEXT:    fcvtl v0.4s, v3.4h
-; CHECK-NO16-NEXT:    fcvtzs v1.4s, v1.4s
+; CHECK-NO16-NEXT:    movi v1.8h, #68, lsl #8
+; CHECK-NO16-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-NO16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-NO16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-NO16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-NO16-NEXT:    fmul v2.4s, v2.4s, v3.4s
+; CHECK-NO16-NEXT:    fmul v0.4s, v0.4s, v1.4s
+; CHECK-NO16-NEXT:    fcvtn v1.4h, v2.4s
+; CHECK-NO16-NEXT:    fcvtn2 v1.8h, v0.4s
+; CHECK-NO16-NEXT:    fcvtl2 v0.4s, v1.8h
+; CHECK-NO16-NEXT:    fcvtl v1.4s, v1.4h
 ; CHECK-NO16-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-NO16-NEXT:    fcvtzs v1.4s, v1.4s
 ; CHECK-NO16-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
 ; CHECK-NO16-NEXT:    ret
 ;
@@ -496,96 +467,67 @@ define <3 x i32> @test_illegal_fp_to_int_sat_sat(<3 x float> %in) {
 define <8 x i16> @test_v8f16_sat(<8 x half> %in) {
 ; CHECK-NO16-LABEL: test_v8f16_sat:
 ; CHECK-NO16:       // %bb.0:
-; CHECK-NO16-NEXT:    mov h2, v0.h[4]
-; CHECK-NO16-NEXT:    mov h3, v0.h[5]
+; CHECK-NO16-NEXT:    movi v1.8h, #68, lsl #8
+; CHECK-NO16-NEXT:    fcvtl v2.4s, v0.4h
 ; CHECK-NO16-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NO16-NEXT:    mov h4, v0.h[6]
-; CHECK-NO16-NEXT:    fmov s1, #4.00000000
+; CHECK-NO16-NEXT:    fcvtl2 v0.4s, v0.8h
 ; CHECK-NO16-NEXT:    mov w11, #-32768 // =0xffff8000
-; CHECK-NO16-NEXT:    mov h5, v0.h[7]
-; CHECK-NO16-NEXT:    mov h6, v0.h[1]
-; CHECK-NO16-NEXT:    mov h7, v0.h[2]
-; CHECK-NO16-NEXT:    fcvt s16, h0
-; CHECK-NO16-NEXT:    mov h0, v0.h[3]
-; CHECK-NO16-NEXT:    fcvt s2, h2
-; CHECK-NO16-NEXT:    fcvt s3, h3
-; CHECK-NO16-NEXT:    fcvt s4, h4
-; CHECK-NO16-NEXT:    fcvt s5, h5
-; CHECK-NO16-NEXT:    fcvt s6, h6
-; CHECK-NO16-NEXT:    fmul s2, s2, s1
-; CHECK-NO16-NEXT:    fmul s3, s3, s1
-; CHECK-NO16-NEXT:    fmul s4, s4, s1
-; CHECK-NO16-NEXT:    fmul s5, s5, s1
-; CHECK-NO16-NEXT:    fmul s6, s6, s1
-; CHECK-NO16-NEXT:    fcvt h2, s2
-; CHECK-NO16-NEXT:    fcvt h3, s3
-; CHECK-NO16-NEXT:    fcvt h4, s4
-; CHECK-NO16-NEXT:    fcvt h5, s5
-; CHECK-NO16-NEXT:    fcvt h6, s6
-; CHECK-NO16-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-NO16-NEXT:    fcvt s3, h7
-; CHECK-NO16-NEXT:    fmul s7, s16, s1
-; CHECK-NO16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-NO16-NEXT:    fcvt s4, h0
-; CHECK-NO16-NEXT:    fmul s3, s3, s1
-; CHECK-NO16-NEXT:    fcvt h0, s7
-; CHECK-NO16-NEXT:    mov v2.h[3], v5.h[0]
-; CHECK-NO16-NEXT:    fmul s1, s4, s1
-; CHECK-NO16-NEXT:    fcvt h3, s3
-; CHECK-NO16-NEXT:    mov v0.h[1], v6.h[0]
-; CHECK-NO16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-NO16-NEXT:    fcvt h1, s1
-; CHECK-NO16-NEXT:    mov v0.h[2], v3.h[0]
-; CHECK-NO16-NEXT:    mov s4, v2.s[1]
-; CHECK-NO16-NEXT:    fcvtzs w10, s2
-; CHECK-NO16-NEXT:    mov v0.h[3], v1.h[0]
-; CHECK-NO16-NEXT:    mov s1, v2.s[2]
-; CHECK-NO16-NEXT:    mov s2, v2.s[3]
-; CHECK-NO16-NEXT:    fcvtzs w9, s4
-; CHECK-NO16-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-NO16-NEXT:    fcvtzs w12, s1
-; CHECK-NO16-NEXT:    fcvtzs w13, s2
+; CHECK-NO16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-NO16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-NO16-NEXT:    fmul v2.4s, v2.4s, v3.4s
+; CHECK-NO16-NEXT:    fmul v0.4s, v0.4s, v1.4s
+; CHECK-NO16-NEXT:    fcvtn v1.4h, v2.4s
+; CHECK-NO16-NEXT:    fcvtn2 v1.8h, v0.4s
+; CHECK-NO16-NEXT:    fcvtl2 v0.4s, v1.8h
+; CHECK-NO16-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-NO16-NEXT:    mov s2, v0.s[1]
+; CHECK-NO16-NEXT:    fcvtzs w10, s0
+; CHECK-NO16-NEXT:    fcvtzs w15, s1
+; CHECK-NO16-NEXT:    fcvtzs w9, s2
+; CHECK-NO16-NEXT:    mov s2, v0.s[2]
+; CHECK-NO16-NEXT:    mov s0, v0.s[3]
 ; CHECK-NO16-NEXT:    cmp w9, w8
+; CHECK-NO16-NEXT:    fcvtzs w12, s2
+; CHECK-NO16-NEXT:    mov s2, v1.s[1]
 ; CHECK-NO16-NEXT:    csel w9, w9, w8, lt
-; CHECK-NO16-NEXT:    mov s1, v0.s[1]
-; CHECK-NO16-NEXT:    fcvtzs w15, s0
+; CHECK-NO16-NEXT:    fcvtzs w13, s0
+; CHECK-NO16-NEXT:    mov s0, v1.s[2]
 ; CHECK-NO16-NEXT:    cmn w9, #8, lsl #12 // =32768
 ; CHECK-NO16-NEXT:    csel w9, w9, w11, gt
 ; CHECK-NO16-NEXT:    cmp w10, w8
 ; CHECK-NO16-NEXT:    csel w10, w10, w8, lt
+; CHECK-NO16-NEXT:    fcvtzs w14, s2
 ; CHECK-NO16-NEXT:    cmn w10, #8, lsl #12 // =32768
-; CHECK-NO16-NEXT:    fcvtzs w14, s1
-; CHECK-NO16-NEXT:    mov s1, v0.s[2]
+; CHECK-NO16-NEXT:    fcvtzs w16, s0
+; CHECK-NO16-NEXT:    mov s0, v1.s[3]
 ; CHECK-NO16-NEXT:    csel w10, w10, w11, gt
 ; CHECK-NO16-NEXT:    cmp w12, w8
-; CHECK-NO16-NEXT:    mov s0, v0.s[3]
 ; CHECK-NO16-NEXT:    csel w12, w12, w8, lt
+; CHECK-NO16-NEXT:    fmov s1, w10
 ; CHECK-NO16-NEXT:    cmn w12, #8, lsl #12 // =32768
 ; CHECK-NO16-NEXT:    csel w12, w12, w11, gt
 ; CHECK-NO16-NEXT:    cmp w13, w8
-; CHECK-NO16-NEXT:    fcvtzs w16, s1
 ; CHECK-NO16-NEXT:    csel w13, w13, w8, lt
-; CHECK-NO16-NEXT:    fmov s1, w10
+; CHECK-NO16-NEXT:    mov v1.s[1], w9
+; CHECK-NO16-NEXT:    fcvtzs w9, s0
 ; CHECK-NO16-NEXT:    cmn w13, #8, lsl #12 // =32768
 ; CHECK-NO16-NEXT:    csel w13, w13, w11, gt
 ; CHECK-NO16-NEXT:    cmp w14, w8
 ; CHECK-NO16-NEXT:    csel w14, w14, w8, lt
-; CHECK-NO16-NEXT:    mov v1.s[1], w9
-; CHECK-NO16-NEXT:    fcvtzs w9, s0
 ; CHECK-NO16-NEXT:    cmn w14, #8, lsl #12 // =32768
+; CHECK-NO16-NEXT:    mov v1.s[2], w12
 ; CHECK-NO16-NEXT:    csel w14, w14, w11, gt
 ; CHECK-NO16-NEXT:    cmp w15, w8
 ; CHECK-NO16-NEXT:    csel w15, w15, w8, lt
 ; CHECK-NO16-NEXT:    cmn w15, #8, lsl #12 // =32768
-; CHECK-NO16-NEXT:    mov v1.s[2], w12
 ; CHECK-NO16-NEXT:    csel w10, w15, w11, gt
 ; CHECK-NO16-NEXT:    cmp w16, w8
+; CHECK-NO16-NEXT:    mov v1.s[3], w13
 ; CHECK-NO16-NEXT:    fmov s2, w10
 ; CHECK-NO16-NEXT:    csel w10, w16, w8, lt
 ; CHECK-NO16-NEXT:    cmn w10, #8, lsl #12 // =32768
 ; CHECK-NO16-NEXT:    csel w10, w10, w11, gt
 ; CHECK-NO16-NEXT:    cmp w9, w8
-; CHECK-NO16-NEXT:    mov v1.s[3], w13
 ; CHECK-NO16-NEXT:    mov v2.s[1], w14
 ; CHECK-NO16-NEXT:    csel w8, w9, w8, lt
 ; CHECK-NO16-NEXT:    cmn w8, #8, lsl #12 // =32768

diff  --git a/llvm/test/CodeGen/AArch64/fdiv.ll b/llvm/test/CodeGen/AArch64/fdiv.ll
index fa87c4fa2d1660..e73124fbb595bb 100644
--- a/llvm/test/CodeGen/AArch64/fdiv.ll
+++ b/llvm/test/CodeGen/AArch64/fdiv.ll
@@ -169,60 +169,14 @@ entry:
 define <7 x half> @fdiv_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-SD-NOFP16-LABEL: fdiv_v7f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h0
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fdiv s2, s3, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h1
-; CHECK-SD-NOFP16-NEXT:    fdiv s3, s4, s3
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt h18, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fdiv s4, s5, s4
-; CHECK-SD-NOFP16-NEXT:    mov h5, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[1], v18.h[0]
-; CHECK-SD-NOFP16-NEXT:    fdiv s5, s6, s5
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fdiv s6, s7, s6
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fdiv s7, s16, s7
-; CHECK-SD-NOFP16-NEXT:    mov h16, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fdiv s3, s17, s16
-; CHECK-SD-NOFP16-NEXT:    fdiv s0, s0, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s6
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s7
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s3
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fdiv v2.4s, v3.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fdiv v1.4s, v0.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: fdiv_v7f16:
@@ -309,60 +263,14 @@ entry:
 define <8 x half> @fdiv_v8f16(<8 x half> %a, <8 x half> %b) {
 ; CHECK-SD-NOFP16-LABEL: fdiv_v8f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h0
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fdiv s2, s3, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h1
-; CHECK-SD-NOFP16-NEXT:    fdiv s3, s4, s3
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt h18, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fdiv s4, s5, s4
-; CHECK-SD-NOFP16-NEXT:    mov h5, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[1], v18.h[0]
-; CHECK-SD-NOFP16-NEXT:    fdiv s5, s6, s5
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fdiv s6, s7, s6
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fdiv s7, s16, s7
-; CHECK-SD-NOFP16-NEXT:    mov h16, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fdiv s3, s17, s16
-; CHECK-SD-NOFP16-NEXT:    fdiv s0, s0, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s6
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s7
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s3
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fdiv v2.4s, v3.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fdiv v1.4s, v0.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: fdiv_v8f16:
@@ -394,113 +302,22 @@ entry:
 define <16 x half> @fdiv_v16f16(<16 x half> %a, <16 x half> %b) {
 ; CHECK-SD-NOFP16-LABEL: fdiv_v16f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h4, v2.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h0
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h18, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h19, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h1
-; CHECK-SD-NOFP16-NEXT:    mov h21, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h22, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    mov h23, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h21
-; CHECK-SD-NOFP16-NEXT:    fcvt s22, h22
-; CHECK-SD-NOFP16-NEXT:    mov h24, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    fdiv s4, s5, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s23, h23
-; CHECK-SD-NOFP16-NEXT:    mov h25, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s24, h24
-; CHECK-SD-NOFP16-NEXT:    fcvt s25, h25
-; CHECK-SD-NOFP16-NEXT:    fdiv s5, s6, s5
-; CHECK-SD-NOFP16-NEXT:    mov h6, v2.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fdiv s7, s7, s6
-; CHECK-SD-NOFP16-NEXT:    mov h6, v2.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fdiv s6, s16, s6
-; CHECK-SD-NOFP16-NEXT:    mov h16, v2.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fdiv s16, s17, s16
-; CHECK-SD-NOFP16-NEXT:    mov h17, v2.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    fdiv s17, s18, s17
-; CHECK-SD-NOFP16-NEXT:    mov h18, v2.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v2.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fdiv s18, s19, s18
-; CHECK-SD-NOFP16-NEXT:    fdiv s19, s0, s2
-; CHECK-SD-NOFP16-NEXT:    mov h0, v3.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fdiv s2, s2, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h3
-; CHECK-SD-NOFP16-NEXT:    fdiv s20, s20, s0
-; CHECK-SD-NOFP16-NEXT:    mov h0, v3.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fdiv s21, s21, s0
-; CHECK-SD-NOFP16-NEXT:    mov h0, v3.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fdiv s22, s22, s0
-; CHECK-SD-NOFP16-NEXT:    mov h0, v3.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fdiv s23, s23, s0
-; CHECK-SD-NOFP16-NEXT:    mov h0, v3.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fdiv s24, s24, s0
-; CHECK-SD-NOFP16-NEXT:    mov h0, v3.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v3.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt s26, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s20
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[1], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s7
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[1], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s21
-; CHECK-SD-NOFP16-NEXT:    fdiv s20, s25, s26
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s6
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[2], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s22
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s23
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    fdiv s1, s1, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s16
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s24
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[4], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s17
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s20
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s18
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[6], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[7], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[7], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v1.16b, v2.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v4.4s, v2.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v5.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fdiv v4.4s, v5.4s, v4.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtl v5.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fdiv v0.4s, v0.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v3.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v3.4s, v3.8h
+; CHECK-SD-NOFP16-NEXT:    fdiv v2.4s, v5.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v4.4s
+; CHECK-SD-NOFP16-NEXT:    fdiv v3.4s, v1.4s, v3.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v3.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: fdiv_v16f16:

diff  --git a/llvm/test/CodeGen/AArch64/fmla.ll b/llvm/test/CodeGen/AArch64/fmla.ll
index 339ade5fc7074f..336c9705f399d9 100644
--- a/llvm/test/CodeGen/AArch64/fmla.ll
+++ b/llvm/test/CodeGen/AArch64/fmla.ll
@@ -840,99 +840,22 @@ entry:
 define <7 x half> @fmuladd_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ; CHECK-SD-NOFP16-LABEL: fmuladd_v7f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h0
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h18, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h2
-; CHECK-SD-NOFP16-NEXT:    mov h20, v2.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h21, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h22, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fmul s5, s6, s5
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h21
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h20
-; CHECK-SD-NOFP16-NEXT:    fmul s3, s4, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h17
-; CHECK-SD-NOFP16-NEXT:    fmul s7, s16, s7
-; CHECK-SD-NOFP16-NEXT:    mov h17, v2.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fmul s5, s6, s5
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fadd s4, s4, s19
-; CHECK-SD-NOFP16-NEXT:    mov h19, v2.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fmul s16, s18, s16
-; CHECK-SD-NOFP16-NEXT:    mov h18, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fadd s17, s3, s17
-; CHECK-SD-NOFP16-NEXT:    fmul s6, s6, s21
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fadd s4, s7, s20
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h22
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s17
-; CHECK-SD-NOFP16-NEXT:    mov h17, v2.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    fadd s5, s5, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fmul s18, s20, s18
-; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s1
-; CHECK-SD-NOFP16-NEXT:    mov h1, v2.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v2.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    mov h2, v2.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fadd s16, s16, s17
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fadd s6, s6, s7
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s6
-; CHECK-SD-NOFP16-NEXT:    fadd s1, s5, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v3.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v4.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fmul v3.4s, v4.4s, v3.4s
+; CHECK-SD-NOFP16-NEXT:    fmul v0.4s, v0.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v3.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v2.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v2.4s, v2.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v0.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtl v0.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fadd v0.4s, v0.4s, v3.4s
+; CHECK-SD-NOFP16-NEXT:    fadd v1.4s, v1.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: fmuladd_v7f16:
@@ -1045,99 +968,22 @@ entry:
 define <8 x half> @fmuladd_v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; CHECK-SD-NOFP16-LABEL: fmuladd_v8f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h0
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h18, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h2
-; CHECK-SD-NOFP16-NEXT:    mov h20, v2.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h21, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h22, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fmul s5, s6, s5
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h21
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h20
-; CHECK-SD-NOFP16-NEXT:    fmul s3, s4, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h17
-; CHECK-SD-NOFP16-NEXT:    fmul s7, s16, s7
-; CHECK-SD-NOFP16-NEXT:    mov h17, v2.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fmul s5, s6, s5
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fadd s4, s4, s19
-; CHECK-SD-NOFP16-NEXT:    mov h19, v2.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fmul s16, s18, s16
-; CHECK-SD-NOFP16-NEXT:    mov h18, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fadd s17, s3, s17
-; CHECK-SD-NOFP16-NEXT:    fmul s6, s6, s21
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fadd s4, s7, s20
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h22
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s17
-; CHECK-SD-NOFP16-NEXT:    mov h17, v2.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    fadd s5, s5, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fmul s18, s20, s18
-; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s1
-; CHECK-SD-NOFP16-NEXT:    mov h1, v2.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v2.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    mov h2, v2.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fadd s16, s16, s17
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fadd s6, s6, s7
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s6
-; CHECK-SD-NOFP16-NEXT:    fadd s1, s5, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v3.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v4.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fmul v3.4s, v4.4s, v3.4s
+; CHECK-SD-NOFP16-NEXT:    fmul v0.4s, v0.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v3.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v2.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v2.4s, v2.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v0.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtl v0.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fadd v0.4s, v0.4s, v3.4s
+; CHECK-SD-NOFP16-NEXT:    fadd v1.4s, v1.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: fmuladd_v8f16:
@@ -1179,192 +1025,38 @@ entry:
 define <16 x half> @fmuladd_v16f16(<16 x half> %a, <16 x half> %b, <16 x half> %c) {
 ; CHECK-SD-NOFP16-LABEL: fmuladd_v16f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h6, v2.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h0
-; CHECK-SD-NOFP16-NEXT:    mov h18, v2.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h19, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h20, v2.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h21, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h24, v3.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h25, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h26, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s27, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    mov h29, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fmul s16, s17, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-SD-NOFP16-NEXT:    mov h17, v4.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h20
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h21
-; CHECK-SD-NOFP16-NEXT:    fcvt s24, h24
-; CHECK-SD-NOFP16-NEXT:    fcvt s25, h25
-; CHECK-SD-NOFP16-NEXT:    mov h30, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    fmul s6, s7, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h4
-; CHECK-SD-NOFP16-NEXT:    fmul s23, s19, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt h22, s16
-; CHECK-SD-NOFP16-NEXT:    mov h18, v2.h[4]
-; CHECK-SD-NOFP16-NEXT:    fmul s20, s21, s20
-; CHECK-SD-NOFP16-NEXT:    mov h19, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v4.h[2]
-; CHECK-SD-NOFP16-NEXT:    fmul s24, s25, s24
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    mov h25, v3.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h22
-; CHECK-SD-NOFP16-NEXT:    fcvt h22, s23
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt h20, s20
-; CHECK-SD-NOFP16-NEXT:    fcvt s23, h6
-; CHECK-SD-NOFP16-NEXT:    mov h6, v4.h[3]
-; CHECK-SD-NOFP16-NEXT:    fadd s7, s21, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h22
-; CHECK-SD-NOFP16-NEXT:    fcvt s22, h3
-; CHECK-SD-NOFP16-NEXT:    fmul s18, s19, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h20
-; CHECK-SD-NOFP16-NEXT:    fadd s17, s23, s17
-; CHECK-SD-NOFP16-NEXT:    mov h23, v3.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s28, h6
-; CHECK-SD-NOFP16-NEXT:    fmul s22, s27, s22
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s7
-; CHECK-SD-NOFP16-NEXT:    fadd s7, s21, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h26
-; CHECK-SD-NOFP16-NEXT:    mov h26, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt h18, s18
-; CHECK-SD-NOFP16-NEXT:    mov h27, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s17
-; CHECK-SD-NOFP16-NEXT:    mov h17, v4.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h23
-; CHECK-SD-NOFP16-NEXT:    fadd s19, s19, s28
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s7
-; CHECK-SD-NOFP16-NEXT:    mov h23, v2.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-SD-NOFP16-NEXT:    fmul s20, s21, s20
-; CHECK-SD-NOFP16-NEXT:    mov v6.h[1], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v5.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    fcvt h21, s22
-; CHECK-SD-NOFP16-NEXT:    fcvt h22, s24
-; CHECK-SD-NOFP16-NEXT:    fcvt s24, h25
-; CHECK-SD-NOFP16-NEXT:    fcvt s25, h26
-; CHECK-SD-NOFP16-NEXT:    mov h26, v5.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt h19, s19
-; CHECK-SD-NOFP16-NEXT:    mov v6.h[2], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v3.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s28, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt h20, s20
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h21
-; CHECK-SD-NOFP16-NEXT:    fcvt s22, h22
-; CHECK-SD-NOFP16-NEXT:    fadd s16, s18, s17
-; CHECK-SD-NOFP16-NEXT:    fmul s18, s25, s24
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h5
-; CHECK-SD-NOFP16-NEXT:    mov h24, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s25, h26
-; CHECK-SD-NOFP16-NEXT:    fcvt s26, h27
-; CHECK-SD-NOFP16-NEXT:    mov v6.h[3], v19.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h23
-; CHECK-SD-NOFP16-NEXT:    mov h23, v2.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h20
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s27, h29
-; CHECK-SD-NOFP16-NEXT:    fadd s22, s22, s28
-; CHECK-SD-NOFP16-NEXT:    fadd s17, s21, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt h18, s18
-; CHECK-SD-NOFP16-NEXT:    mov h21, v5.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h28, v3.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h29, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    fmul s19, s26, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt s23, h23
-; CHECK-SD-NOFP16-NEXT:    fcvt s24, h24
-; CHECK-SD-NOFP16-NEXT:    fadd s20, s20, s25
-; CHECK-SD-NOFP16-NEXT:    fmul s25, s27, s7
-; CHECK-SD-NOFP16-NEXT:    mov h27, v3.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt h22, s22
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h21
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h28
-; CHECK-SD-NOFP16-NEXT:    fcvt s28, h29
-; CHECK-SD-NOFP16-NEXT:    fmul s23, s24, s23
-; CHECK-SD-NOFP16-NEXT:    mov h2, v2.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt s24, h27
-; CHECK-SD-NOFP16-NEXT:    fcvt s26, h30
-; CHECK-SD-NOFP16-NEXT:    fcvt h20, s20
-; CHECK-SD-NOFP16-NEXT:    mov v7.h[1], v22.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h22, s25
-; CHECK-SD-NOFP16-NEXT:    mov h25, v5.h[4]
-; CHECK-SD-NOFP16-NEXT:    fadd s17, s17, s18
-; CHECK-SD-NOFP16-NEXT:    fmul s18, s28, s21
-; CHECK-SD-NOFP16-NEXT:    mov h3, v3.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt h19, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fmul s21, s26, s24
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    mov h24, v4.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s22, h22
-; CHECK-SD-NOFP16-NEXT:    fcvt s25, h25
-; CHECK-SD-NOFP16-NEXT:    mov v7.h[2], v20.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h17, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt h18, s18
-; CHECK-SD-NOFP16-NEXT:    mov h20, v5.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt h23, s23
-; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
-; CHECK-SD-NOFP16-NEXT:    mov h2, v4.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt h21, s21
-; CHECK-SD-NOFP16-NEXT:    fadd s22, s22, s25
-; CHECK-SD-NOFP16-NEXT:    mov h25, v5.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT:    mov v7.h[3], v17.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h19
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h24
-; CHECK-SD-NOFP16-NEXT:    fmul s1, s1, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h20
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h23
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h21
-; CHECK-SD-NOFP16-NEXT:    fcvt h22, s22
-; CHECK-SD-NOFP16-NEXT:    fcvt s23, h25
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    fadd s17, s17, s19
-; CHECK-SD-NOFP16-NEXT:    mov h4, v4.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v5.h[7]
-; CHECK-SD-NOFP16-NEXT:    fadd s3, s3, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v6.h[4], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    fadd s2, s20, s2
-; CHECK-SD-NOFP16-NEXT:    mov v7.h[4], v22.h[0]
-; CHECK-SD-NOFP16-NEXT:    fadd s16, s21, s23
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt h17, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v6.h[5], v17.h[0]
-; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s4
-; CHECK-SD-NOFP16-NEXT:    mov v7.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s16
-; CHECK-SD-NOFP16-NEXT:    fadd s1, s1, s5
-; CHECK-SD-NOFP16-NEXT:    mov v6.h[6], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v7.h[6], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v6.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v7.h[7], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v6.16b
-; CHECK-SD-NOFP16-NEXT:    mov v1.16b, v7.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v6.4s, v2.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v7.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v16.4s, v3.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v17.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v2.4s, v2.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v3.4s, v3.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fmul v6.4s, v7.4s, v6.4s
+; CHECK-SD-NOFP16-NEXT:    fmul v7.4s, v17.4s, v16.4s
+; CHECK-SD-NOFP16-NEXT:    fmul v0.4s, v0.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fmul v1.4s, v1.4s, v3.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v2.4h, v6.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtl v6.4s, v5.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtn v3.4h, v7.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v2.8h, v0.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtl v0.4s, v4.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v4.4s, v4.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v3.8h, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtl v1.4s, v2.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v2.4s, v2.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v7.4s, v3.4h
+; CHECK-SD-NOFP16-NEXT:    fadd v0.4s, v1.4s, v0.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v3.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v3.4s, v5.8h
+; CHECK-SD-NOFP16-NEXT:    fadd v5.4s, v7.4s, v6.4s
+; CHECK-SD-NOFP16-NEXT:    fadd v2.4s, v2.4s, v4.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-SD-NOFP16-NEXT:    fadd v3.4s, v1.4s, v3.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v5.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v3.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: fmuladd_v16f16:
@@ -1650,99 +1342,22 @@ entry:
 define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ; CHECK-SD-NOFP16-LABEL: fmul_v7f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h0
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h18, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h19, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h20, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h2
-; CHECK-SD-NOFP16-NEXT:    mov h22, v2.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fmul s5, s6, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h18
-; CHECK-SD-NOFP16-NEXT:    mov h18, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h20
-; CHECK-SD-NOFP16-NEXT:    fmul s3, s4, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h17
-; CHECK-SD-NOFP16-NEXT:    mov h17, v2.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fmul s4, s6, s4
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    fmul s7, s16, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h17
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h19
-; CHECK-SD-NOFP16-NEXT:    mov h19, v2.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s7
-; CHECK-SD-NOFP16-NEXT:    fadd s5, s5, s21
-; CHECK-SD-NOFP16-NEXT:    mov h21, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fmul s17, s18, s17
-; CHECK-SD-NOFP16-NEXT:    mov h18, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    fadd s3, s3, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h22
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fmul s6, s20, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    mov h20, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s5
-; CHECK-SD-NOFP16-NEXT:    mov h5, v2.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fadd s4, s4, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h21
-; CHECK-SD-NOFP16-NEXT:    fadd s7, s7, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v2.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fmul s17, s18, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h20
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fadd s5, s16, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v2.h[6]
-; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s17
-; CHECK-SD-NOFP16-NEXT:    mov h2, v2.h[7]
-; CHECK-SD-NOFP16-NEXT:    fadd s3, s6, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fadd s4, s6, s4
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v1.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v4.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fmul v3.4s, v4.4s, v3.4s
+; CHECK-SD-NOFP16-NEXT:    fmul v0.4s, v0.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v3.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v2.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v2.4s, v2.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v0.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtl v0.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fadd v0.4s, v0.4s, v3.4s
+; CHECK-SD-NOFP16-NEXT:    fadd v1.4s, v1.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: fmul_v7f16:
@@ -1857,99 +1472,22 @@ entry:
 define <8 x half> @fmul_v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; CHECK-SD-NOFP16-LABEL: fmul_v8f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h0
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h18, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h19, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h20, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h2
-; CHECK-SD-NOFP16-NEXT:    mov h22, v2.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fmul s5, s6, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h18
-; CHECK-SD-NOFP16-NEXT:    mov h18, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h20
-; CHECK-SD-NOFP16-NEXT:    fmul s3, s4, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h17
-; CHECK-SD-NOFP16-NEXT:    mov h17, v2.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fmul s4, s6, s4
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    fmul s7, s16, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h17
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h19
-; CHECK-SD-NOFP16-NEXT:    mov h19, v2.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s7
-; CHECK-SD-NOFP16-NEXT:    fadd s5, s5, s21
-; CHECK-SD-NOFP16-NEXT:    mov h21, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fmul s17, s18, s17
-; CHECK-SD-NOFP16-NEXT:    mov h18, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    fadd s3, s3, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h22
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fmul s6, s20, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    mov h20, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s5
-; CHECK-SD-NOFP16-NEXT:    mov h5, v2.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fadd s4, s4, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h21
-; CHECK-SD-NOFP16-NEXT:    fadd s7, s7, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v2.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fmul s17, s18, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h20
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fadd s5, s16, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v2.h[6]
-; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s17
-; CHECK-SD-NOFP16-NEXT:    mov h2, v2.h[7]
-; CHECK-SD-NOFP16-NEXT:    fadd s3, s6, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fadd s4, s6, s4
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v1.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v4.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fmul v3.4s, v4.4s, v3.4s
+; CHECK-SD-NOFP16-NEXT:    fmul v0.4s, v0.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v3.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v2.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v2.4s, v2.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v0.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtl v0.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fadd v0.4s, v0.4s, v3.4s
+; CHECK-SD-NOFP16-NEXT:    fadd v1.4s, v1.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: fmul_v8f16:
@@ -1992,199 +1530,38 @@ entry:
 define <16 x half> @fmul_v16f16(<16 x half> %a, <16 x half> %b, <16 x half> %c) {
 ; CHECK-SD-NOFP16-LABEL: fmul_v16f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    stp d11, d10, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-SD-NOFP16-NEXT:    stp d9, d8, [sp, #16] // 16-byte Folded Spill
-; CHECK-SD-NOFP16-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-SD-NOFP16-NEXT:    .cfi_offset b8, -8
-; CHECK-SD-NOFP16-NEXT:    .cfi_offset b9, -16
-; CHECK-SD-NOFP16-NEXT:    .cfi_offset b10, -24
-; CHECK-SD-NOFP16-NEXT:    .cfi_offset b11, -32
-; CHECK-SD-NOFP16-NEXT:    mov h6, v3.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h20, v3.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h22, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h23, v3.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h24, v3.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h26, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v3.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h18, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h27, v3.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h28, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt s25, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h20
-; CHECK-SD-NOFP16-NEXT:    fcvt s22, h22
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h23
-; CHECK-SD-NOFP16-NEXT:    fcvt s23, h24
-; CHECK-SD-NOFP16-NEXT:    fcvt s24, h26
-; CHECK-SD-NOFP16-NEXT:    mov h26, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h19, v3.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s29, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fmul s6, s25, s6
-; CHECK-SD-NOFP16-NEXT:    mov h25, v2.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s27, h27
-; CHECK-SD-NOFP16-NEXT:    fmul s3, s22, s21
-; CHECK-SD-NOFP16-NEXT:    fcvt s28, h28
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    fmul s23, s24, s23
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s22, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s24, h25
-; CHECK-SD-NOFP16-NEXT:    fcvt s25, h26
-; CHECK-SD-NOFP16-NEXT:    fmul s26, s1, s29
-; CHECK-SD-NOFP16-NEXT:    fmul s27, s28, s27
-; CHECK-SD-NOFP16-NEXT:    mov h28, v2.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fmul s1, s18, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt h17, s23
-; CHECK-SD-NOFP16-NEXT:    mov h29, v4.h[1]
-; CHECK-SD-NOFP16-NEXT:    fmul s21, s22, s21
-; CHECK-SD-NOFP16-NEXT:    fmul s16, s16, s19
-; CHECK-SD-NOFP16-NEXT:    mov h8, v2.h[6]
-; CHECK-SD-NOFP16-NEXT:    fmul s23, s25, s24
-; CHECK-SD-NOFP16-NEXT:    mov h24, v2.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h25, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fmul s7, s7, s20
-; CHECK-SD-NOFP16-NEXT:    fcvt h18, s26
-; CHECK-SD-NOFP16-NEXT:    fcvt h19, s27
-; CHECK-SD-NOFP16-NEXT:    fcvt s22, h28
-; CHECK-SD-NOFP16-NEXT:    mov h26, v2.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h27, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt h21, s21
-; CHECK-SD-NOFP16-NEXT:    mov h20, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h11, v2.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt h28, s23
-; CHECK-SD-NOFP16-NEXT:    fcvt s30, h24
-; CHECK-SD-NOFP16-NEXT:    fcvt s31, h25
-; CHECK-SD-NOFP16-NEXT:    mov h24, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s29, h29
-; CHECK-SD-NOFP16-NEXT:    mov h9, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s25, h26
-; CHECK-SD-NOFP16-NEXT:    fcvt s26, h27
-; CHECK-SD-NOFP16-NEXT:    mov h10, v2.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h20
-; CHECK-SD-NOFP16-NEXT:    mov h23, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h11
-; CHECK-SD-NOFP16-NEXT:    fmul s27, s31, s30
-; CHECK-SD-NOFP16-NEXT:    fcvt s28, h28
-; CHECK-SD-NOFP16-NEXT:    fcvt s30, h21
-; CHECK-SD-NOFP16-NEXT:    fcvt s31, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h8
-; CHECK-SD-NOFP16-NEXT:    mov h8, v5.h[1]
-; CHECK-SD-NOFP16-NEXT:    fmul s25, s26, s25
-; CHECK-SD-NOFP16-NEXT:    fcvt s24, h24
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-SD-NOFP16-NEXT:    fmul s2, s20, s22
-; CHECK-SD-NOFP16-NEXT:    fcvt s22, h9
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h10
-; CHECK-SD-NOFP16-NEXT:    fadd s26, s28, s29
-; CHECK-SD-NOFP16-NEXT:    fcvt s23, h23
-; CHECK-SD-NOFP16-NEXT:    fcvt h27, s27
-; CHECK-SD-NOFP16-NEXT:    fadd s28, s30, s31
-; CHECK-SD-NOFP16-NEXT:    mov h29, v4.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h30, v5.h[2]
-; CHECK-SD-NOFP16-NEXT:    fmul s24, s24, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt s31, h8
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-SD-NOFP16-NEXT:    fmul s21, s22, s21
-; CHECK-SD-NOFP16-NEXT:    fcvt s8, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt h25, s25
-; CHECK-SD-NOFP16-NEXT:    fmul s20, s23, s20
-; CHECK-SD-NOFP16-NEXT:    fcvt h26, s26
-; CHECK-SD-NOFP16-NEXT:    fcvt s27, h27
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s28
-; CHECK-SD-NOFP16-NEXT:    mov h28, v4.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s29, h29
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    fcvt s30, h30
-; CHECK-SD-NOFP16-NEXT:    fadd s19, s19, s31
-; CHECK-SD-NOFP16-NEXT:    fadd s18, s18, s8
-; CHECK-SD-NOFP16-NEXT:    ldp d9, d8, [sp, #16] // 16-byte Folded Reload
-; CHECK-SD-NOFP16-NEXT:    fcvt h22, s1
-; CHECK-SD-NOFP16-NEXT:    mov h23, v5.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s25, h25
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[1], v26.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s26, h28
-; CHECK-SD-NOFP16-NEXT:    fadd s27, s27, s29
-; CHECK-SD-NOFP16-NEXT:    fcvt h24, s24
-; CHECK-SD-NOFP16-NEXT:    fadd s17, s17, s30
-; CHECK-SD-NOFP16-NEXT:    mov h28, v4.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt h19, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h22
-; CHECK-SD-NOFP16-NEXT:    fcvt s22, h23
-; CHECK-SD-NOFP16-NEXT:    fcvt h20, s20
-; CHECK-SD-NOFP16-NEXT:    fadd s23, s25, s26
-; CHECK-SD-NOFP16-NEXT:    mov h25, v5.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt h26, s27
-; CHECK-SD-NOFP16-NEXT:    fcvt s24, h24
-; CHECK-SD-NOFP16-NEXT:    fcvt s27, h28
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s7
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v19.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h19, v4.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt h17, s17
-; CHECK-SD-NOFP16-NEXT:    fadd s18, s18, s22
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt h21, s21
-; CHECK-SD-NOFP16-NEXT:    fcvt s22, h25
-; CHECK-SD-NOFP16-NEXT:    mov h25, v5.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[2], v26.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h23, s23
-; CHECK-SD-NOFP16-NEXT:    fadd s24, s24, s27
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h20
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-SD-NOFP16-NEXT:    mov h26, v4.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v17.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h17, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fadd s16, s16, s22
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h25
-; CHECK-SD-NOFP16-NEXT:    mov h22, v5.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[3], v23.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h23, s24
-; CHECK-SD-NOFP16-NEXT:    fadd s19, s20, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h21
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h26
-; CHECK-SD-NOFP16-NEXT:    mov h4, v4.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v17.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT:    fadd s7, s7, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h22
-; CHECK-SD-NOFP16-NEXT:    mov h5, v5.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[4], v23.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h18, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fadd s19, s20, s21
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s7
-; CHECK-SD-NOFP16-NEXT:    fadd s3, s3, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[5], v18.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s19
-; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s4
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fadd s4, s6, s5
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[6], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s4
-; CHECK-SD-NOFP16-NEXT:    mov v0.h[7], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    ldp d11, d10, [sp], #32 // 16-byte Folded Reload
+; CHECK-SD-NOFP16-NEXT:    fcvtl v6.4s, v2.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v7.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v16.4s, v3.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v17.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v2.4s, v2.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v3.4s, v3.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fmul v6.4s, v7.4s, v6.4s
+; CHECK-SD-NOFP16-NEXT:    fmul v7.4s, v17.4s, v16.4s
+; CHECK-SD-NOFP16-NEXT:    fmul v0.4s, v0.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fmul v1.4s, v1.4s, v3.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v2.4h, v6.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtl v6.4s, v5.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtn v3.4h, v7.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v2.8h, v0.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtl v0.4s, v4.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v4.4s, v4.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v3.8h, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtl v1.4s, v2.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v2.4s, v2.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v7.4s, v3.4h
+; CHECK-SD-NOFP16-NEXT:    fadd v0.4s, v1.4s, v0.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v3.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v3.4s, v5.8h
+; CHECK-SD-NOFP16-NEXT:    fadd v5.4s, v7.4s, v6.4s
+; CHECK-SD-NOFP16-NEXT:    fadd v2.4s, v2.4s, v4.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-SD-NOFP16-NEXT:    fadd v3.4s, v1.4s, v3.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v5.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v3.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: fmul_v16f16:

diff  --git a/llvm/test/CodeGen/AArch64/fmul.ll b/llvm/test/CodeGen/AArch64/fmul.ll
index fe84fe1f317a07..1f49601a18272f 100644
--- a/llvm/test/CodeGen/AArch64/fmul.ll
+++ b/llvm/test/CodeGen/AArch64/fmul.ll
@@ -169,60 +169,14 @@ entry:
 define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-SD-NOFP16-LABEL: fmul_v7f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h0
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fmul s4, s5, s4
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fmul s3, s3, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    fmul s4, s7, s6
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fmul s5, s5, s16
-; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h7
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    fmul s3, s6, s3
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT:    fmul s6, s16, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fmul s4, s5, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s1
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fmul v2.4s, v3.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fmul v1.4s, v0.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: fmul_v7f16:
@@ -309,60 +263,14 @@ entry:
 define <8 x half> @fmul_v8f16(<8 x half> %a, <8 x half> %b) {
 ; CHECK-SD-NOFP16-LABEL: fmul_v8f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h0
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fmul s4, s5, s4
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fmul s3, s3, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    fmul s4, s7, s6
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fmul s5, s5, s16
-; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h7
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    fmul s3, s6, s3
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT:    fmul s6, s16, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fmul s4, s5, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s1
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fmul v2.4s, v3.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fmul v1.4s, v0.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: fmul_v8f16:
@@ -394,114 +302,22 @@ entry:
 define <16 x half> @fmul_v16f16(<16 x half> %a, <16 x half> %b) {
 ; CHECK-SD-NOFP16-LABEL: fmul_v16f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h4, v2.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v3.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h0
-; CHECK-SD-NOFP16-NEXT:    mov h18, v2.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h19, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h1
-; CHECK-SD-NOFP16-NEXT:    mov h22, v3.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h23, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fmul s16, s17, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    mov h24, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h19
-; CHECK-SD-NOFP16-NEXT:    mov h19, v2.h[3]
-; CHECK-SD-NOFP16-NEXT:    fmul s20, s21, s20
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h22
-; CHECK-SD-NOFP16-NEXT:    fcvt s22, h23
-; CHECK-SD-NOFP16-NEXT:    fmul s5, s5, s4
-; CHECK-SD-NOFP16-NEXT:    mov h23, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h25, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    fmul s6, s7, s6
-; CHECK-SD-NOFP16-NEXT:    mov h7, v3.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s24, h24
-; CHECK-SD-NOFP16-NEXT:    fmul s17, s18, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s16
-; CHECK-SD-NOFP16-NEXT:    fmul s18, s22, s21
-; CHECK-SD-NOFP16-NEXT:    mov h22, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s20
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h23
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    mov h20, v2.h[4]
-; CHECK-SD-NOFP16-NEXT:    fmul s19, s24, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt h17, s17
-; CHECK-SD-NOFP16-NEXT:    mov h23, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h24, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[1], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v3.h[4]
-; CHECK-SD-NOFP16-NEXT:    fmul s7, s21, s7
-; CHECK-SD-NOFP16-NEXT:    mov v5.h[1], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h20
-; CHECK-SD-NOFP16-NEXT:    fcvt h19, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h22
-; CHECK-SD-NOFP16-NEXT:    mov h21, v2.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h22, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[2], v17.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h23
-; CHECK-SD-NOFP16-NEXT:    mov v5.h[2], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s7
-; CHECK-SD-NOFP16-NEXT:    mov h7, v3.h[5]
-; CHECK-SD-NOFP16-NEXT:    fmul s18, s20, s18
-; CHECK-SD-NOFP16-NEXT:    mov h23, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h20, v2.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v2.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fmul s16, s17, s16
-; CHECK-SD-NOFP16-NEXT:    mov h17, v3.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[3], v19.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h21
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h22
-; CHECK-SD-NOFP16-NEXT:    mov v5.h[3], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h23
-; CHECK-SD-NOFP16-NEXT:    fcvt h18, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h20
-; CHECK-SD-NOFP16-NEXT:    fcvt s22, h24
-; CHECK-SD-NOFP16-NEXT:    fcvt s23, h25
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    mov h3, v3.h[7]
-; CHECK-SD-NOFP16-NEXT:    fmul s19, s21, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fmul s6, s7, s6
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[4], v18.h[0]
-; CHECK-SD-NOFP16-NEXT:    fmul s7, s22, s20
-; CHECK-SD-NOFP16-NEXT:    mov v5.h[4], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    fmul s16, s23, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h17, s19
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s7
-; CHECK-SD-NOFP16-NEXT:    fmul s1, s1, s3
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[5], v17.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    mov v5.h[5], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[6], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v5.h[6], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[7], v0.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v5.h[7], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v4.16b
-; CHECK-SD-NOFP16-NEXT:    mov v1.16b, v5.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl v4.4s, v2.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v5.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v6.4s, v3.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v7.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v2.4s, v2.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v3.4s, v3.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fmul v4.4s, v5.4s, v4.4s
+; CHECK-SD-NOFP16-NEXT:    fmul v5.4s, v7.4s, v6.4s
+; CHECK-SD-NOFP16-NEXT:    fmul v2.4s, v0.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fmul v3.4s, v1.4s, v3.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v0.4h, v4.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v5.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v0.8h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v3.4s
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: fmul_v16f16:

diff  --git a/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
index ded343b990ac15..d4130e7a848b15 100644
--- a/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
@@ -5,60 +5,14 @@
 define <8 x half> @add_h(<8 x half> %a, <8 x half> %b) {
 ; CHECK-CVT-LABEL: add_h:
 ; CHECK-CVT:       // %bb.0: // %entry
-; CHECK-CVT-NEXT:    mov h2, v1.h[1]
-; CHECK-CVT-NEXT:    mov h3, v0.h[1]
-; CHECK-CVT-NEXT:    fcvt s4, h1
-; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[2]
-; CHECK-CVT-NEXT:    mov h7, v0.h[2]
-; CHECK-CVT-NEXT:    mov h16, v1.h[3]
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fadd s4, s5, s4
-; CHECK-CVT-NEXT:    mov h5, v0.h[3]
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    fcvt s7, h7
-; CHECK-CVT-NEXT:    fcvt s16, h16
-; CHECK-CVT-NEXT:    fadd s3, s3, s2
-; CHECK-CVT-NEXT:    fcvt s5, h5
-; CHECK-CVT-NEXT:    fcvt h2, s4
-; CHECK-CVT-NEXT:    fadd s4, s7, s6
-; CHECK-CVT-NEXT:    mov h6, v1.h[4]
-; CHECK-CVT-NEXT:    mov h7, v0.h[4]
-; CHECK-CVT-NEXT:    fcvt h3, s3
-; CHECK-CVT-NEXT:    fadd s5, s5, s16
-; CHECK-CVT-NEXT:    mov h16, v0.h[5]
-; CHECK-CVT-NEXT:    fcvt h4, s4
-; CHECK-CVT-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-CVT-NEXT:    fcvt s3, h6
-; CHECK-CVT-NEXT:    fcvt s6, h7
-; CHECK-CVT-NEXT:    mov h7, v1.h[5]
-; CHECK-CVT-NEXT:    fcvt h5, s5
-; CHECK-CVT-NEXT:    fcvt s16, h16
-; CHECK-CVT-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-CVT-NEXT:    mov h4, v1.h[6]
-; CHECK-CVT-NEXT:    fadd s3, s6, s3
-; CHECK-CVT-NEXT:    mov h6, v0.h[6]
-; CHECK-CVT-NEXT:    fcvt s7, h7
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
-; CHECK-CVT-NEXT:    mov h0, v0.h[7]
-; CHECK-CVT-NEXT:    mov v2.h[3], v5.h[0]
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    fcvt h3, s3
-; CHECK-CVT-NEXT:    fcvt s5, h6
-; CHECK-CVT-NEXT:    fadd s6, s16, s7
-; CHECK-CVT-NEXT:    fcvt s1, h1
-; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov v2.h[4], v3.h[0]
-; CHECK-CVT-NEXT:    fadd s4, s5, s4
-; CHECK-CVT-NEXT:    fcvt h3, s6
-; CHECK-CVT-NEXT:    fadd s0, s0, s1
-; CHECK-CVT-NEXT:    mov v2.h[5], v3.h[0]
-; CHECK-CVT-NEXT:    fcvt h3, s4
-; CHECK-CVT-NEXT:    fcvt h0, s0
-; CHECK-CVT-NEXT:    mov v2.h[6], v3.h[0]
-; CHECK-CVT-NEXT:    mov v2.h[7], v0.h[0]
-; CHECK-CVT-NEXT:    mov v0.16b, v2.16b
+; CHECK-CVT-NEXT:    fcvtl v2.4s, v1.4h
+; CHECK-CVT-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-CVT-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-CVT-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-NEXT:    fadd v2.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT:    fadd v1.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-CVT-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: add_h:
@@ -74,60 +28,14 @@ entry:
 define <8 x half> @sub_h(<8 x half> %a, <8 x half> %b) {
 ; CHECK-CVT-LABEL: sub_h:
 ; CHECK-CVT:       // %bb.0: // %entry
-; CHECK-CVT-NEXT:    mov h2, v1.h[1]
-; CHECK-CVT-NEXT:    mov h3, v0.h[1]
-; CHECK-CVT-NEXT:    fcvt s4, h1
-; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[2]
-; CHECK-CVT-NEXT:    mov h7, v0.h[2]
-; CHECK-CVT-NEXT:    mov h16, v1.h[3]
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fsub s4, s5, s4
-; CHECK-CVT-NEXT:    mov h5, v0.h[3]
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    fcvt s7, h7
-; CHECK-CVT-NEXT:    fcvt s16, h16
-; CHECK-CVT-NEXT:    fsub s3, s3, s2
-; CHECK-CVT-NEXT:    fcvt s5, h5
-; CHECK-CVT-NEXT:    fcvt h2, s4
-; CHECK-CVT-NEXT:    fsub s4, s7, s6
-; CHECK-CVT-NEXT:    mov h6, v1.h[4]
-; CHECK-CVT-NEXT:    mov h7, v0.h[4]
-; CHECK-CVT-NEXT:    fcvt h3, s3
-; CHECK-CVT-NEXT:    fsub s5, s5, s16
-; CHECK-CVT-NEXT:    mov h16, v0.h[5]
-; CHECK-CVT-NEXT:    fcvt h4, s4
-; CHECK-CVT-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-CVT-NEXT:    fcvt s3, h6
-; CHECK-CVT-NEXT:    fcvt s6, h7
-; CHECK-CVT-NEXT:    mov h7, v1.h[5]
-; CHECK-CVT-NEXT:    fcvt h5, s5
-; CHECK-CVT-NEXT:    fcvt s16, h16
-; CHECK-CVT-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-CVT-NEXT:    mov h4, v1.h[6]
-; CHECK-CVT-NEXT:    fsub s3, s6, s3
-; CHECK-CVT-NEXT:    mov h6, v0.h[6]
-; CHECK-CVT-NEXT:    fcvt s7, h7
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
-; CHECK-CVT-NEXT:    mov h0, v0.h[7]
-; CHECK-CVT-NEXT:    mov v2.h[3], v5.h[0]
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    fcvt h3, s3
-; CHECK-CVT-NEXT:    fcvt s5, h6
-; CHECK-CVT-NEXT:    fsub s6, s16, s7
-; CHECK-CVT-NEXT:    fcvt s1, h1
-; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov v2.h[4], v3.h[0]
-; CHECK-CVT-NEXT:    fsub s4, s5, s4
-; CHECK-CVT-NEXT:    fcvt h3, s6
-; CHECK-CVT-NEXT:    fsub s0, s0, s1
-; CHECK-CVT-NEXT:    mov v2.h[5], v3.h[0]
-; CHECK-CVT-NEXT:    fcvt h3, s4
-; CHECK-CVT-NEXT:    fcvt h0, s0
-; CHECK-CVT-NEXT:    mov v2.h[6], v3.h[0]
-; CHECK-CVT-NEXT:    mov v2.h[7], v0.h[0]
-; CHECK-CVT-NEXT:    mov v0.16b, v2.16b
+; CHECK-CVT-NEXT:    fcvtl v2.4s, v1.4h
+; CHECK-CVT-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-CVT-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-CVT-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-NEXT:    fsub v2.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT:    fsub v1.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-CVT-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: sub_h:
@@ -143,60 +51,14 @@ entry:
 define <8 x half> @mul_h(<8 x half> %a, <8 x half> %b) {
 ; CHECK-CVT-LABEL: mul_h:
 ; CHECK-CVT:       // %bb.0: // %entry
-; CHECK-CVT-NEXT:    mov h2, v1.h[1]
-; CHECK-CVT-NEXT:    mov h3, v0.h[1]
-; CHECK-CVT-NEXT:    fcvt s4, h1
-; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[2]
-; CHECK-CVT-NEXT:    mov h7, v0.h[2]
-; CHECK-CVT-NEXT:    mov h16, v1.h[3]
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fmul s4, s5, s4
-; CHECK-CVT-NEXT:    mov h5, v0.h[3]
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    fcvt s7, h7
-; CHECK-CVT-NEXT:    fcvt s16, h16
-; CHECK-CVT-NEXT:    fmul s3, s3, s2
-; CHECK-CVT-NEXT:    fcvt s5, h5
-; CHECK-CVT-NEXT:    fcvt h2, s4
-; CHECK-CVT-NEXT:    fmul s4, s7, s6
-; CHECK-CVT-NEXT:    mov h6, v1.h[4]
-; CHECK-CVT-NEXT:    mov h7, v0.h[4]
-; CHECK-CVT-NEXT:    fcvt h3, s3
-; CHECK-CVT-NEXT:    fmul s5, s5, s16
-; CHECK-CVT-NEXT:    mov h16, v0.h[5]
-; CHECK-CVT-NEXT:    fcvt h4, s4
-; CHECK-CVT-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-CVT-NEXT:    fcvt s3, h6
-; CHECK-CVT-NEXT:    fcvt s6, h7
-; CHECK-CVT-NEXT:    mov h7, v1.h[5]
-; CHECK-CVT-NEXT:    fcvt h5, s5
-; CHECK-CVT-NEXT:    fcvt s16, h16
-; CHECK-CVT-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-CVT-NEXT:    mov h4, v1.h[6]
-; CHECK-CVT-NEXT:    fmul s3, s6, s3
-; CHECK-CVT-NEXT:    mov h6, v0.h[6]
-; CHECK-CVT-NEXT:    fcvt s7, h7
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
-; CHECK-CVT-NEXT:    mov h0, v0.h[7]
-; CHECK-CVT-NEXT:    mov v2.h[3], v5.h[0]
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    fcvt h3, s3
-; CHECK-CVT-NEXT:    fcvt s5, h6
-; CHECK-CVT-NEXT:    fmul s6, s16, s7
-; CHECK-CVT-NEXT:    fcvt s1, h1
-; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov v2.h[4], v3.h[0]
-; CHECK-CVT-NEXT:    fmul s4, s5, s4
-; CHECK-CVT-NEXT:    fcvt h3, s6
-; CHECK-CVT-NEXT:    fmul s0, s0, s1
-; CHECK-CVT-NEXT:    mov v2.h[5], v3.h[0]
-; CHECK-CVT-NEXT:    fcvt h3, s4
-; CHECK-CVT-NEXT:    fcvt h0, s0
-; CHECK-CVT-NEXT:    mov v2.h[6], v3.h[0]
-; CHECK-CVT-NEXT:    mov v2.h[7], v0.h[0]
-; CHECK-CVT-NEXT:    mov v0.16b, v2.16b
+; CHECK-CVT-NEXT:    fcvtl v2.4s, v1.4h
+; CHECK-CVT-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-CVT-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-CVT-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-NEXT:    fmul v2.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT:    fmul v1.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-CVT-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: mul_h:
@@ -212,60 +74,14 @@ entry:
 define <8 x half> @div_h(<8 x half> %a, <8 x half> %b) {
 ; CHECK-CVT-LABEL: div_h:
 ; CHECK-CVT:       // %bb.0: // %entry
-; CHECK-CVT-NEXT:    mov h2, v1.h[1]
-; CHECK-CVT-NEXT:    mov h3, v0.h[1]
-; CHECK-CVT-NEXT:    fcvt s4, h0
-; CHECK-CVT-NEXT:    mov h5, v0.h[2]
-; CHECK-CVT-NEXT:    mov h6, v0.h[3]
-; CHECK-CVT-NEXT:    mov h7, v0.h[4]
-; CHECK-CVT-NEXT:    mov h16, v0.h[5]
-; CHECK-CVT-NEXT:    mov h17, v0.h[6]
-; CHECK-CVT-NEXT:    mov h0, v0.h[7]
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcvt s5, h5
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    fcvt s7, h7
-; CHECK-CVT-NEXT:    fcvt s16, h16
-; CHECK-CVT-NEXT:    fcvt s17, h17
-; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    fdiv s2, s3, s2
-; CHECK-CVT-NEXT:    fcvt s3, h1
-; CHECK-CVT-NEXT:    fdiv s3, s4, s3
-; CHECK-CVT-NEXT:    mov h4, v1.h[2]
-; CHECK-CVT-NEXT:    fcvt h18, s2
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    fdiv s4, s5, s4
-; CHECK-CVT-NEXT:    mov h5, v1.h[3]
-; CHECK-CVT-NEXT:    fcvt h2, s3
-; CHECK-CVT-NEXT:    fcvt s5, h5
-; CHECK-CVT-NEXT:    mov v2.h[1], v18.h[0]
-; CHECK-CVT-NEXT:    fdiv s5, s6, s5
-; CHECK-CVT-NEXT:    mov h6, v1.h[4]
-; CHECK-CVT-NEXT:    fcvt h4, s4
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-CVT-NEXT:    fdiv s6, s7, s6
-; CHECK-CVT-NEXT:    mov h7, v1.h[5]
-; CHECK-CVT-NEXT:    fcvt h4, s5
-; CHECK-CVT-NEXT:    fcvt s7, h7
-; CHECK-CVT-NEXT:    mov v2.h[3], v4.h[0]
-; CHECK-CVT-NEXT:    fdiv s7, s16, s7
-; CHECK-CVT-NEXT:    mov h16, v1.h[6]
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
-; CHECK-CVT-NEXT:    fcvt s16, h16
-; CHECK-CVT-NEXT:    fcvt s1, h1
-; CHECK-CVT-NEXT:    fdiv s3, s17, s16
-; CHECK-CVT-NEXT:    fdiv s0, s0, s1
-; CHECK-CVT-NEXT:    fcvt h1, s6
-; CHECK-CVT-NEXT:    mov v2.h[4], v1.h[0]
-; CHECK-CVT-NEXT:    fcvt h1, s7
-; CHECK-CVT-NEXT:    mov v2.h[5], v1.h[0]
-; CHECK-CVT-NEXT:    fcvt h1, s3
-; CHECK-CVT-NEXT:    mov v2.h[6], v1.h[0]
-; CHECK-CVT-NEXT:    fcvt h0, s0
-; CHECK-CVT-NEXT:    mov v2.h[7], v0.h[0]
-; CHECK-CVT-NEXT:    mov v0.16b, v2.16b
+; CHECK-CVT-NEXT:    fcvtl v2.4s, v1.4h
+; CHECK-CVT-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-CVT-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-CVT-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-NEXT:    fdiv v2.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT:    fdiv v1.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-CVT-NEXT:    fcvtn2 v0.8h, v1.4s
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: div_h:

diff  --git a/llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll b/llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll
index 1b2e0be6111c01..aefc8de431436a 100644
--- a/llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll
@@ -17,8 +17,6 @@ define <vscale x 8 x half> @fdiv_recip_8f16(<vscale x 8 x half> %a, <vscale x 8
 ; CHECK-LABEL: fdiv_recip_8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frecpe z2.h, z1.h
-; CHECK-NEXT:    frecps z3.h, z1.h, z2.h
-; CHECK-NEXT:    fmul z2.h, z2.h, z3.h
 ; CHECK-NEXT:    frecps z1.h, z1.h, z2.h
 ; CHECK-NEXT:    fmul z1.h, z2.h, z1.h
 ; CHECK-NEXT:    fmul z0.h, z1.h, z0.h
@@ -98,9 +96,6 @@ define <vscale x 8 x half> @fsqrt_recip_8f16(<vscale x 8 x half> %a) #0 {
 ; CHECK-NEXT:    fcmne p0.h, p0/z, z0.h, #0.0
 ; CHECK-NEXT:    frsqrts z2.h, z0.h, z2.h
 ; CHECK-NEXT:    fmul z1.h, z1.h, z2.h
-; CHECK-NEXT:    fmul z2.h, z1.h, z1.h
-; CHECK-NEXT:    frsqrts z2.h, z0.h, z2.h
-; CHECK-NEXT:    fmul z1.h, z1.h, z2.h
 ; CHECK-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    ret
   %fsqrt = call fast <vscale x 8 x half> @llvm.sqrt.nxv8f16(<vscale x 8 x half> %a)

diff  --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
index 99c6808724b5b5..03db1d0d433d3c 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
@@ -147,79 +147,48 @@ define double @add_D(<2 x double> %bin.rdx)  {
 define half @add_2H(<16 x half> %bin.rdx)  {
 ; CHECK-SD-NOFP16-LABEL: add_2H:
 ; CHECK-SD-NOFP16:       // %bb.0:
-; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fadd s4, s5, s4
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fadd s2, s3, s2
-; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fadd s3, s5, s3
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    fadd s2, s4, s2
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fadd s4, s5, s4
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s3
-; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
+; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fadd v2.4s, v3.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v0.4s
+; CHECK-SD-NOFP16-NEXT:    mov h0, v1.h[1]
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h1
+; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fadd s0, s2, s0
+; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[2]
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fadd s3, s5, s3
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s4
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
+; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[3]
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fadd s4, s5, s4
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s4
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
+; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[4]
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s1
-; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s3
-; CHECK-SD-NOFP16-NEXT:    fadd s3, s5, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
+; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[5]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
+; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[6]
+; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fadd s0, s1, s0
+; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s1
 ; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
@@ -574,78 +543,47 @@ exit:
 define half @fadd_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) {
 ; CHECK-SD-NOFP16-LABEL: fadd_reduct_reassoc_v8f16:
 ; CHECK-SD-NOFP16:       // %bb.0:
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fadd s2, s4, s2
-; CHECK-SD-NOFP16-NEXT:    fadd s3, s5, s3
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s4
-; CHECK-SD-NOFP16-NEXT:    fadd s3, s3, s5
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s4
-; CHECK-SD-NOFP16-NEXT:    fadd s3, s3, s5
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fadd v2.4s, v3.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v0.4s
+; CHECK-SD-NOFP16-NEXT:    mov h0, v1.h[1]
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h1
+; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fadd s0, s2, s0
+; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[2]
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s4
-; CHECK-SD-NOFP16-NEXT:    fadd s3, s3, s5
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
+; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[3]
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s4
-; CHECK-SD-NOFP16-NEXT:    fadd s3, s3, s5
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
+; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[4]
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s4
-; CHECK-SD-NOFP16-NEXT:    fadd s3, s3, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
+; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[5]
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fadd s0, s2, s0
-; CHECK-SD-NOFP16-NEXT:    fadd s1, s3, s1
+; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
+; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[6]
+; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
 ; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s1
 ; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    ret

diff  --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll
index e85384e46222e7..da75a805952120 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll
@@ -182,79 +182,48 @@ define double @mul_D(<2 x double> %bin.rdx)  {
 define half @mul_2H(<16 x half> %bin.rdx)  {
 ; CHECK-SD-NOFP16-LABEL: mul_2H:
 ; CHECK-SD-NOFP16:       // %bb.0:
-; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fmul s4, s5, s4
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fmul s2, s3, s2
-; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fmul s3, s5, s3
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    fmul s2, s4, s2
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fmul s4, s5, s4
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fmul s2, s2, s3
-; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
+; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fmul v2.4s, v3.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fmul v0.4s, v0.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v0.4s
+; CHECK-SD-NOFP16-NEXT:    mov h0, v1.h[1]
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h1
+; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fmul s0, s2, s0
+; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[2]
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fmul s3, s5, s3
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fmul s2, s2, s4
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
+; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[3]
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fmul s4, s5, s4
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    fmul s2, s2, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s4
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
+; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[4]
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s1
-; CHECK-SD-NOFP16-NEXT:    fmul s2, s2, s3
-; CHECK-SD-NOFP16-NEXT:    fmul s3, s5, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
+; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[5]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
+; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[6]
+; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fmul s2, s2, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fmul s0, s1, s0
+; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s1
 ; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
@@ -361,78 +330,47 @@ define float @mul_S_init_42(<4 x float> %bin.rdx)  {
 define half @fmul_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) {
 ; CHECK-SD-NOFP16-LABEL: fmul_reduct_reassoc_v8f16:
 ; CHECK-SD-NOFP16:       // %bb.0:
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fmul s2, s4, s2
-; CHECK-SD-NOFP16-NEXT:    fmul s3, s5, s3
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fmul s2, s2, s4
-; CHECK-SD-NOFP16-NEXT:    fmul s3, s3, s5
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fmul s2, s2, s4
-; CHECK-SD-NOFP16-NEXT:    fmul s3, s3, s5
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fmul v2.4s, v3.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fmul v0.4s, v0.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v0.4s
+; CHECK-SD-NOFP16-NEXT:    mov h0, v1.h[1]
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h1
+; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fmul s0, s2, s0
+; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[2]
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fmul s2, s2, s4
-; CHECK-SD-NOFP16-NEXT:    fmul s3, s3, s5
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
+; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[3]
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fmul s2, s2, s4
-; CHECK-SD-NOFP16-NEXT:    fmul s3, s3, s5
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
+; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[4]
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fmul s2, s2, s4
-; CHECK-SD-NOFP16-NEXT:    fmul s3, s3, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
+; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[5]
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fmul s0, s2, s0
-; CHECK-SD-NOFP16-NEXT:    fmul s1, s3, s1
+; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
+; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[6]
+; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
 ; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s1
 ; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    ret


        


More information about the llvm-commits mailing list