[llvm] [LLVM][CodeGen][AArch64] Don't scalarise v8{f16,bf16} vsetcc operations. (PR #135398)

Paul Walker via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 15 10:41:46 PDT 2025


https://github.com/paulwalker-arm updated https://github.com/llvm/llvm-project/pull/135398

>From b7cfa4a572ce18a354a2bc705f37d0d959389954 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Tue, 15 Apr 2025 13:55:43 +0000
Subject: [PATCH 1/2] [LLVM][CostModel][AArch64] Remove magic numbers from f16
 vector compares.

The PR also extends the code to cover bfloat vector compares, which are
likewise promoted to float.

NOTE: There is a bail-out for the compares that are currently scalarised;
it will be removed by https://github.com/llvm/llvm-project/pull/135398.
---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 32 ++++++++++++++++---
 .../CostModel/AArch64/vector-select.ll        | 16 +++++-----
 2 files changed, 36 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 2b9d32f9208fe..f79b8277b4cd1 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4236,10 +4236,34 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
   }
 
   if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) {
-    auto LT = getTypeLegalizationCost(ValTy);
-    // Cost v4f16 FCmp without FP16 support via converting to v4f32 and back.
-    if (LT.second == MVT::v4f16 && !ST->hasFullFP16())
-      return LT.first * 4; // fcvtl + fcvtl + fcmp + xtn
+    Type *ValScalarTy = ValTy->getScalarType();
+    if ((ValScalarTy->isHalfTy() && !ST->hasFullFP16()) ||
+        ValScalarTy->isBFloatTy()) {
+      auto *ValVTy = cast<FixedVectorType>(ValTy);
+
+      // FIXME: We currently scalarise these.
+      if (ValVTy->getNumElements() > 4)
+        return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred,
+                                         CostKind, Op1Info, Op2Info, I);
+
+      // Without dedicated instructions we promote [b]f16 compares to f32.
+      auto *PromotedTy =
+          VectorType::get(Type::getFloatTy(ValTy->getContext()), ValVTy);
+
+      InstructionCost Cost = 0;
+      // Promote operands to float vectors.
+      Cost += 2 * getCastInstrCost(Instruction::FPExt, PromotedTy, ValTy,
+                                   TTI::CastContextHint::None, CostKind);
+      // Compare float vectors.
+      Cost += getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred, CostKind,
+                                 Op1Info, Op2Info);
+      // During codegen we'll truncate the vector result from i32 to i16.
+      Cost +=
+          getCastInstrCost(Instruction::Trunc, VectorType::getInteger(ValVTy),
+                           VectorType::getInteger(PromotedTy),
+                           TTI::CastContextHint::None, CostKind);
+      return Cost;
+    }
   }
 
   // Treat the icmp in icmp(and, 0) as free, as we can make use of ands.
diff --git a/llvm/test/Analysis/CostModel/AArch64/vector-select.ll b/llvm/test/Analysis/CostModel/AArch64/vector-select.ll
index c2256159a8ee2..e66f94dd54f21 100644
--- a/llvm/test/Analysis/CostModel/AArch64/vector-select.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/vector-select.ll
@@ -168,7 +168,7 @@ define <2 x double> @v2f64_select_ogt(<2 x double> %a, <2 x double> %b, <2 x dou
 
 define <4 x bfloat> @v4bf16_select_ogt(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
 ; COST-LABEL: 'v4bf16_select_ogt'
-; COST-NEXT:  Cost Model: Found costs of 1 for: %cmp.1 = fcmp ogt <4 x bfloat> %a, %b
+; COST-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp ogt <4 x bfloat> %a, %b
 ; COST-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <4 x i1> %cmp.1, <4 x bfloat> %a, <4 x bfloat> %c
 ; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x bfloat> %s.1
 ;
@@ -255,7 +255,7 @@ define <2 x double> @v2f64_select_oge(<2 x double> %a, <2 x double> %b, <2 x dou
 
 define <4 x bfloat> @v4bf16_select_oge(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
 ; COST-LABEL: 'v4bf16_select_oge'
-; COST-NEXT:  Cost Model: Found costs of 1 for: %cmp.1 = fcmp oge <4 x bfloat> %a, %b
+; COST-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp oge <4 x bfloat> %a, %b
 ; COST-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <4 x i1> %cmp.1, <4 x bfloat> %a, <4 x bfloat> %c
 ; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x bfloat> %s.1
 ;
@@ -342,7 +342,7 @@ define <2 x double> @v2f64_select_olt(<2 x double> %a, <2 x double> %b, <2 x dou
 
 define <4 x bfloat> @v4bf16_select_olt(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
 ; COST-LABEL: 'v4bf16_select_olt'
-; COST-NEXT:  Cost Model: Found costs of 1 for: %cmp.1 = fcmp olt <4 x bfloat> %a, %b
+; COST-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp olt <4 x bfloat> %a, %b
 ; COST-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <4 x i1> %cmp.1, <4 x bfloat> %a, <4 x bfloat> %c
 ; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x bfloat> %s.1
 ;
@@ -429,7 +429,7 @@ define <2 x double> @v2f64_select_ole(<2 x double> %a, <2 x double> %b, <2 x dou
 
 define <4 x bfloat> @v4bf16_select_ole(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
 ; COST-LABEL: 'v4bf16_select_ole'
-; COST-NEXT:  Cost Model: Found costs of 1 for: %cmp.1 = fcmp ole <4 x bfloat> %a, %b
+; COST-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp ole <4 x bfloat> %a, %b
 ; COST-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <4 x i1> %cmp.1, <4 x bfloat> %a, <4 x bfloat> %c
 ; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x bfloat> %s.1
 ;
@@ -516,7 +516,7 @@ define <2 x double> @v2f64_select_oeq(<2 x double> %a, <2 x double> %b, <2 x dou
 
 define <4 x bfloat> @v4bf16_select_oeq(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
 ; COST-LABEL: 'v4bf16_select_oeq'
-; COST-NEXT:  Cost Model: Found costs of 1 for: %cmp.1 = fcmp oeq <4 x bfloat> %a, %b
+; COST-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp oeq <4 x bfloat> %a, %b
 ; COST-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <4 x i1> %cmp.1, <4 x bfloat> %a, <4 x bfloat> %c
 ; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x bfloat> %s.1
 ;
@@ -603,7 +603,7 @@ define <2 x double> @v2f64_select_one(<2 x double> %a, <2 x double> %b, <2 x dou
 
 define <4 x bfloat> @v4bf16_select_one(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
 ; COST-LABEL: 'v4bf16_select_one'
-; COST-NEXT:  Cost Model: Found costs of 1 for: %cmp.1 = fcmp one <4 x bfloat> %a, %b
+; COST-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp one <4 x bfloat> %a, %b
 ; COST-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <4 x i1> %cmp.1, <4 x bfloat> %a, <4 x bfloat> %c
 ; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x bfloat> %s.1
 ;
@@ -690,7 +690,7 @@ define <2 x double> @v2f64_select_une(<2 x double> %a, <2 x double> %b, <2 x dou
 
 define <4 x bfloat> @v4bf16_select_une(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
 ; COST-LABEL: 'v4bf16_select_une'
-; COST-NEXT:  Cost Model: Found costs of 1 for: %cmp.1 = fcmp une <4 x bfloat> %a, %b
+; COST-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp une <4 x bfloat> %a, %b
 ; COST-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <4 x i1> %cmp.1, <4 x bfloat> %a, <4 x bfloat> %c
 ; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x bfloat> %s.1
 ;
@@ -777,7 +777,7 @@ define <2 x double> @v2f64_select_ord(<2 x double> %a, <2 x double> %b, <2 x dou
 
 define <4 x bfloat> @v4bf16_select_ord(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
 ; COST-LABEL: 'v4bf16_select_ord'
-; COST-NEXT:  Cost Model: Found costs of 1 for: %cmp.1 = fcmp ord <4 x bfloat> %a, %b
+; COST-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp ord <4 x bfloat> %a, %b
 ; COST-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <4 x i1> %cmp.1, <4 x bfloat> %a, <4 x bfloat> %c
 ; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x bfloat> %s.1
 ;

>From cc5975244f12e150c15b33afaa9e081a5afca01b Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Fri, 11 Apr 2025 16:17:55 +0100
Subject: [PATCH 2/2] [LLVM][CodeGen][AArch64] Don't scalarise v8{f16,bf16}
 vsetcc operations.

I have also removed custom promotion code for the v4{f16,bf16} cases
because the same common code can be used.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  36 +-
 .../AArch64/AArch64TargetTransformInfo.cpp    |   5 -
 llvm/test/Analysis/CostModel/AArch64/cmp.ll   |   4 +-
 .../CostModel/AArch64/vector-select.ll        |  32 +-
 .../CodeGen/AArch64/bf16-v8-instructions.ll   | 901 +++--------------
 llvm/test/CodeGen/AArch64/fcmp.ll             | 582 ++---------
 .../CodeGen/AArch64/fp16-v8-instructions.ll   | 949 +++---------------
 7 files changed, 442 insertions(+), 2067 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 830ec6886e6bc..dcbaf3a5ec7da 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -840,11 +840,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationPromotedToType(ISD::FRINT,      V4Narrow, MVT::v4f32);
     setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
     setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32);
+    setOperationPromotedToType(ISD::SETCC,         V4Narrow, MVT::v4f32);
 
     setOperationAction(ISD::FABS,        V4Narrow, Legal);
-    setOperationAction(ISD::FNEG, 	 V4Narrow, Legal);
+    setOperationAction(ISD::FNEG,        V4Narrow, Legal);
     setOperationAction(ISD::FMA,         V4Narrow, Expand);
-    setOperationAction(ISD::SETCC,       V4Narrow, Custom);
     setOperationAction(ISD::BR_CC,       V4Narrow, Expand);
     setOperationAction(ISD::SELECT,      V4Narrow, Expand);
     setOperationAction(ISD::SELECT_CC,   V4Narrow, Expand);
@@ -852,6 +852,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FSQRT,       V4Narrow, Expand);
 
     auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
+    setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
+    setOperationPromotedToType(ISD::SETCC,         V8Narrow, MVT::v8f32);
+
     setOperationAction(ISD::FABS,        V8Narrow, Legal);
     setOperationAction(ISD::FADD,        V8Narrow, Legal);
     setOperationAction(ISD::FCEIL,       V8Narrow, Legal);
@@ -861,19 +864,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FMA,         V8Narrow, Expand);
     setOperationAction(ISD::FMUL,        V8Narrow, Legal);
     setOperationAction(ISD::FNEARBYINT,  V8Narrow, Legal);
-    setOperationAction(ISD::FNEG, 	 V8Narrow, Legal);
+    setOperationAction(ISD::FNEG,        V8Narrow, Legal);
     setOperationAction(ISD::FROUND,      V8Narrow, Legal);
     setOperationAction(ISD::FROUNDEVEN,  V8Narrow, Legal);
     setOperationAction(ISD::FRINT,       V8Narrow, Legal);
     setOperationAction(ISD::FSQRT,       V8Narrow, Expand);
     setOperationAction(ISD::FSUB,        V8Narrow, Legal);
     setOperationAction(ISD::FTRUNC,      V8Narrow, Legal);
-    setOperationAction(ISD::SETCC,       V8Narrow, Expand);
     setOperationAction(ISD::BR_CC,       V8Narrow, Expand);
     setOperationAction(ISD::SELECT,      V8Narrow, Expand);
     setOperationAction(ISD::SELECT_CC,   V8Narrow, Expand);
     setOperationAction(ISD::FP_EXTEND,   V8Narrow, Expand);
-    setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
   };
 
   if (!Subtarget->hasFullFP16()) {
@@ -15898,6 +15899,11 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
   if (LHS.getValueType().getVectorElementType().isInteger())
     return Op;
 
+  assert((!Subtarget->hasFullFP16() &&
+          LHS.getValueType().getVectorElementType() != MVT::f16) ||
+         LHS.getValueType().getVectorElementType() != MVT::bf16 ||
+         LHS.getValueType().getVectorElementType() != MVT::f128);
+
   // Lower isnan(x) | isnan(never-nan) to x != x.
   // Lower !isnan(x) & !isnan(never-nan) to x == x.
   if (CC == ISD::SETUO || CC == ISD::SETO) {
@@ -15916,26 +15922,6 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
     }
   }
 
-  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
-
-  // Make v4f16 (only) fcmp operations utilise vector instructions
-  // v8f16 support will be a litle more complicated
-  if ((!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) ||
-      LHS.getValueType().getVectorElementType() == MVT::bf16) {
-    if (LHS.getValueType().getVectorNumElements() == 4) {
-      LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
-      RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
-      SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
-      DAG.ReplaceAllUsesWith(Op, NewSetcc);
-      CmpVT = MVT::v4i32;
-    } else
-      return SDValue();
-  }
-
-  assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
-         LHS.getValueType().getVectorElementType() != MVT::bf16 ||
-         LHS.getValueType().getVectorElementType() != MVT::f128);
-
   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
   // clean.  Some of them require two branches to implement.
   AArch64CC::CondCode CC1, CC2;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index f79b8277b4cd1..abb5b726ea55c 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4241,11 +4241,6 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
         ValScalarTy->isBFloatTy()) {
       auto *ValVTy = cast<FixedVectorType>(ValTy);
 
-      // FIXME: We currently scalarise these.
-      if (ValVTy->getNumElements() > 4)
-        return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred,
-                                         CostKind, Op1Info, Op2Info, I);
-
       // Without dedicated instructions we promote [b]f16 compares to f32.
       auto *PromotedTy =
           VectorType::get(Type::getFloatTy(ValTy->getContext()), ValVTy);
diff --git a/llvm/test/Analysis/CostModel/AArch64/cmp.ll b/llvm/test/Analysis/CostModel/AArch64/cmp.ll
index f49d9be6b16b7..aba113865af10 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cmp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cmp.ll
@@ -17,10 +17,10 @@ define void @cmps() {
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %cf32 = fcmp ogt float undef, undef
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %cf64 = fcmp ogt double undef, undef
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %cbf64 = fcmp ogt bfloat undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cfv816 = fcmp olt <8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %cfv816 = fcmp olt <8 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %cfv432 = fcmp oge <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %cfv264 = fcmp oge <2 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cbfv816 = fcmp olt <8 x bfloat> undef, undef
+; CHECK-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %cbfv816 = fcmp olt <8 x bfloat> undef, undef
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %c8 = icmp slt i8 undef, undef
diff --git a/llvm/test/Analysis/CostModel/AArch64/vector-select.ll b/llvm/test/Analysis/CostModel/AArch64/vector-select.ll
index e66f94dd54f21..d568f4b2c0b5c 100644
--- a/llvm/test/Analysis/CostModel/AArch64/vector-select.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/vector-select.ll
@@ -119,7 +119,7 @@ define <4 x half> @v4f16_select_ogt(<4 x half> %a, <4 x half> %b, <4 x half> %c)
 
 define <8 x half> @v8f16_select_ogt(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; COST-NOFP16-LABEL: 'v8f16_select_ogt'
-; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp ogt <8 x half> %a, %b
+; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp ogt <8 x half> %a, %b
 ; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
 ; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1
 ;
@@ -179,7 +179,7 @@ define <4 x bfloat> @v4bf16_select_ogt(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bf
 
 define <8 x bfloat> @v8bf16_select_ogt(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) {
 ; COST-LABEL: 'v8bf16_select_ogt'
-; COST-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp ogt <8 x bfloat> %a, %b
+; COST-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp ogt <8 x bfloat> %a, %b
 ; COST-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x bfloat> %a, <8 x bfloat> %c
 ; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x bfloat> %s.1
 ;
@@ -206,7 +206,7 @@ define <4 x half> @v4f16_select_oge(<4 x half> %a, <4 x half> %b, <4 x half> %c)
 
 define <8 x half> @v8f16_select_oge(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; COST-NOFP16-LABEL: 'v8f16_select_oge'
-; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp oge <8 x half> %a, %b
+; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp oge <8 x half> %a, %b
 ; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
 ; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1
 ;
@@ -266,7 +266,7 @@ define <4 x bfloat> @v4bf16_select_oge(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bf
 
 define <8 x bfloat> @v8bf16_select_oge(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) {
 ; COST-LABEL: 'v8bf16_select_oge'
-; COST-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp oge <8 x bfloat> %a, %b
+; COST-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp oge <8 x bfloat> %a, %b
 ; COST-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x bfloat> %a, <8 x bfloat> %c
 ; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x bfloat> %s.1
 ;
@@ -293,7 +293,7 @@ define <4 x half> @v4f16_select_olt(<4 x half> %a, <4 x half> %b, <4 x half> %c)
 
 define <8 x half> @v8f16_select_olt(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; COST-NOFP16-LABEL: 'v8f16_select_olt'
-; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp olt <8 x half> %a, %b
+; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp olt <8 x half> %a, %b
 ; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
 ; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1
 ;
@@ -353,7 +353,7 @@ define <4 x bfloat> @v4bf16_select_olt(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bf
 
 define <8 x bfloat> @v8bf16_select_olt(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) {
 ; COST-LABEL: 'v8bf16_select_olt'
-; COST-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp olt <8 x bfloat> %a, %b
+; COST-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp olt <8 x bfloat> %a, %b
 ; COST-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x bfloat> %a, <8 x bfloat> %c
 ; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x bfloat> %s.1
 ;
@@ -380,7 +380,7 @@ define <4 x half> @v4f16_select_ole(<4 x half> %a, <4 x half> %b, <4 x half> %c)
 
 define <8 x half> @v8f16_select_ole(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; COST-NOFP16-LABEL: 'v8f16_select_ole'
-; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp ole <8 x half> %a, %b
+; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp ole <8 x half> %a, %b
 ; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
 ; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1
 ;
@@ -440,7 +440,7 @@ define <4 x bfloat> @v4bf16_select_ole(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bf
 
 define <8 x bfloat> @v8bf16_select_ole(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) {
 ; COST-LABEL: 'v8bf16_select_ole'
-; COST-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp ole <8 x bfloat> %a, %b
+; COST-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp ole <8 x bfloat> %a, %b
 ; COST-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x bfloat> %a, <8 x bfloat> %c
 ; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x bfloat> %s.1
 ;
@@ -467,7 +467,7 @@ define <4 x half> @v4f16_select_oeq(<4 x half> %a, <4 x half> %b, <4 x half> %c)
 
 define <8 x half> @v8f16_select_oeq(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; COST-NOFP16-LABEL: 'v8f16_select_oeq'
-; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp oeq <8 x half> %a, %b
+; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp oeq <8 x half> %a, %b
 ; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
 ; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1
 ;
@@ -527,7 +527,7 @@ define <4 x bfloat> @v4bf16_select_oeq(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bf
 
 define <8 x bfloat> @v8bf16_select_oeq(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) {
 ; COST-LABEL: 'v8bf16_select_oeq'
-; COST-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp oeq <8 x bfloat> %a, %b
+; COST-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp oeq <8 x bfloat> %a, %b
 ; COST-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x bfloat> %a, <8 x bfloat> %c
 ; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x bfloat> %s.1
 ;
@@ -554,7 +554,7 @@ define <4 x half> @v4f16_select_one(<4 x half> %a, <4 x half> %b, <4 x half> %c)
 
 define <8 x half> @v8f16_select_one(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; COST-NOFP16-LABEL: 'v8f16_select_one'
-; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp one <8 x half> %a, %b
+; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp one <8 x half> %a, %b
 ; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
 ; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1
 ;
@@ -614,7 +614,7 @@ define <4 x bfloat> @v4bf16_select_one(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bf
 
 define <8 x bfloat> @v8bf16_select_one(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) {
 ; COST-LABEL: 'v8bf16_select_one'
-; COST-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp one <8 x bfloat> %a, %b
+; COST-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp one <8 x bfloat> %a, %b
 ; COST-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x bfloat> %a, <8 x bfloat> %c
 ; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x bfloat> %s.1
 ;
@@ -641,7 +641,7 @@ define <4 x half> @v4f16_select_une(<4 x half> %a, <4 x half> %b, <4 x half> %c)
 
 define <8 x half> @v8f16_select_une(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; COST-NOFP16-LABEL: 'v8f16_select_une'
-; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp une <8 x half> %a, %b
+; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp une <8 x half> %a, %b
 ; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
 ; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1
 ;
@@ -701,7 +701,7 @@ define <4 x bfloat> @v4bf16_select_une(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bf
 
 define <8 x bfloat> @v8bf16_select_une(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) {
 ; COST-LABEL: 'v8bf16_select_une'
-; COST-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp une <8 x bfloat> %a, %b
+; COST-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp une <8 x bfloat> %a, %b
 ; COST-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x bfloat> %a, <8 x bfloat> %c
 ; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x bfloat> %s.1
 ;
@@ -728,7 +728,7 @@ define <4 x half> @v4f16_select_ord(<4 x half> %a, <4 x half> %b, <4 x half> %c)
 
 define <8 x half> @v8f16_select_ord(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; COST-NOFP16-LABEL: 'v8f16_select_ord'
-; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp ord <8 x half> %a, %b
+; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp ord <8 x half> %a, %b
 ; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
 ; COST-NOFP16-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1
 ;
@@ -788,7 +788,7 @@ define <4 x bfloat> @v4bf16_select_ord(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bf
 
 define <8 x bfloat> @v8bf16_select_ord(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) {
 ; COST-LABEL: 'v8bf16_select_ord'
-; COST-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp ord <8 x bfloat> %a, %b
+; COST-NEXT:  Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp ord <8 x bfloat> %a, %b
 ; COST-NEXT:  Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x bfloat> %a, <8 x bfloat> %c
 ; COST-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x bfloat> %s.1
 ;
diff --git a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
index 3a55b68f2d1a3..715693dd6ed07 100644
--- a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
@@ -882,61 +882,17 @@ define <8 x i16> @fptoui_i16(<8 x bfloat> %a) #0 {
 define <8 x i1> @test_fcmp_une(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_une:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v2.4h, v1.h[1]
-; CHECK-NEXT:    dup v3.4h, v0.h[1]
-; CHECK-NEXT:    dup v4.4h, v1.h[2]
-; CHECK-NEXT:    dup v5.4h, v0.h[2]
-; CHECK-NEXT:    dup v6.4h, v0.h[3]
-; CHECK-NEXT:    shll v2.4s, v2.4h, #16
-; CHECK-NEXT:    shll v3.4s, v3.4h, #16
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v2.4s, v1.4h, #16
-; CHECK-NEXT:    shll v3.4s, v0.4h, #16
-; CHECK-NEXT:    csetm w8, ne
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v3.4s, v4.4h, #16
-; CHECK-NEXT:    shll v4.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.4h, v1.h[3]
-; CHECK-NEXT:    csetm w9, ne
-; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[4]
-; CHECK-NEXT:    dup v6.8h, v0.h[4]
-; CHECK-NEXT:    mov v2.h[1], w8
-; CHECK-NEXT:    csetm w8, ne
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[5]
-; CHECK-NEXT:    dup v6.8h, v0.h[5]
-; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    csetm w8, ne
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[6]
-; CHECK-NEXT:    dup v6.8h, v0.h[6]
-; CHECK-NEXT:    dup v1.8h, v1.h[7]
-; CHECK-NEXT:    dup v0.8h, v0.h[7]
-; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    csetm w8, ne
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll2 v2.4s, v1.8h, #16
+; CHECK-NEXT:    shll2 v3.4s, v0.8h, #16
 ; CHECK-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    csetm w8, ne
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    mov v2.h[5], w8
-; CHECK-NEXT:    csetm w8, ne
-; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    csetm w8, ne
-; CHECK-NEXT:    mov v2.h[7], w8
-; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    fcmeq v2.4s, v3.4s, v2.4s
+; CHECK-NEXT:    fcmeq v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
   %1 = fcmp une <8 x bfloat> %a, %b
   ret <8 x i1> %1
@@ -945,69 +901,21 @@ define <8 x i1> @test_fcmp_une(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ueq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ueq:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v2.4h, v1.h[1]
-; CHECK-NEXT:    dup v3.4h, v0.h[1]
-; CHECK-NEXT:    dup v4.4h, v1.h[2]
-; CHECK-NEXT:    dup v5.4h, v0.h[2]
-; CHECK-NEXT:    dup v6.4h, v0.h[3]
-; CHECK-NEXT:    shll v2.4s, v2.4h, #16
-; CHECK-NEXT:    shll v3.4s, v3.4h, #16
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v2.4s, v1.4h, #16
-; CHECK-NEXT:    shll v3.4s, v0.4h, #16
-; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    csinv w8, w8, wzr, vc
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v3.4s, v4.4h, #16
-; CHECK-NEXT:    shll v4.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.4h, v1.h[3]
-; CHECK-NEXT:    csetm w9, eq
-; CHECK-NEXT:    csinv w9, w9, wzr, vc
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[4]
-; CHECK-NEXT:    dup v6.8h, v0.h[4]
-; CHECK-NEXT:    mov v2.h[1], w8
-; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    csinv w8, w8, wzr, vc
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[5]
-; CHECK-NEXT:    dup v6.8h, v0.h[5]
-; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    csinv w8, w8, wzr, vc
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[6]
-; CHECK-NEXT:    dup v6.8h, v0.h[6]
-; CHECK-NEXT:    dup v1.8h, v1.h[7]
-; CHECK-NEXT:    dup v0.8h, v0.h[7]
-; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    csinv w8, w8, wzr, vc
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll2 v2.4s, v1.8h, #16
+; CHECK-NEXT:    shll2 v3.4s, v0.8h, #16
 ; CHECK-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    csinv w8, w8, wzr, vc
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    mov v2.h[5], w8
-; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    csinv w8, w8, wzr, vc
-; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    csinv w8, w8, wzr, vc
-; CHECK-NEXT:    mov v2.h[7], w8
-; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    fcmgt v4.4s, v3.4s, v2.4s
+; CHECK-NEXT:    fcmgt v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    fcmgt v3.4s, v0.4s, v1.4s
+; CHECK-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    orr v1.16b, v2.16b, v4.16b
+; CHECK-NEXT:    orr v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
   %1 = fcmp ueq <8 x bfloat> %a, %b
   ret <8 x i1> %1
@@ -1016,61 +924,17 @@ define <8 x i1> @test_fcmp_ueq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ugt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ugt:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v2.4h, v1.h[1]
-; CHECK-NEXT:    dup v3.4h, v0.h[1]
-; CHECK-NEXT:    dup v4.4h, v1.h[2]
-; CHECK-NEXT:    dup v5.4h, v0.h[2]
-; CHECK-NEXT:    dup v6.4h, v0.h[3]
-; CHECK-NEXT:    shll v2.4s, v2.4h, #16
-; CHECK-NEXT:    shll v3.4s, v3.4h, #16
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v2.4s, v1.4h, #16
-; CHECK-NEXT:    shll v3.4s, v0.4h, #16
-; CHECK-NEXT:    csetm w8, hi
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v3.4s, v4.4h, #16
-; CHECK-NEXT:    shll v4.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.4h, v1.h[3]
-; CHECK-NEXT:    csetm w9, hi
-; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[4]
-; CHECK-NEXT:    dup v6.8h, v0.h[4]
-; CHECK-NEXT:    mov v2.h[1], w8
-; CHECK-NEXT:    csetm w8, hi
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[5]
-; CHECK-NEXT:    dup v6.8h, v0.h[5]
-; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    csetm w8, hi
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[6]
-; CHECK-NEXT:    dup v6.8h, v0.h[6]
-; CHECK-NEXT:    dup v1.8h, v1.h[7]
-; CHECK-NEXT:    dup v0.8h, v0.h[7]
-; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    csetm w8, hi
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll2 v2.4s, v0.8h, #16
+; CHECK-NEXT:    shll2 v3.4s, v1.8h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    csetm w8, hi
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    mov v2.h[5], w8
-; CHECK-NEXT:    csetm w8, hi
-; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    csetm w8, hi
-; CHECK-NEXT:    mov v2.h[7], w8
-; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    fcmge v2.4s, v3.4s, v2.4s
+; CHECK-NEXT:    fcmge v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
   %1 = fcmp ugt <8 x bfloat> %a, %b
   ret <8 x i1> %1
@@ -1079,61 +943,17 @@ define <8 x i1> @test_fcmp_ugt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_uge(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_uge:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v2.4h, v1.h[1]
-; CHECK-NEXT:    dup v3.4h, v0.h[1]
-; CHECK-NEXT:    dup v4.4h, v1.h[2]
-; CHECK-NEXT:    dup v5.4h, v0.h[2]
-; CHECK-NEXT:    dup v6.4h, v0.h[3]
-; CHECK-NEXT:    shll v2.4s, v2.4h, #16
-; CHECK-NEXT:    shll v3.4s, v3.4h, #16
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v2.4s, v1.4h, #16
-; CHECK-NEXT:    shll v3.4s, v0.4h, #16
-; CHECK-NEXT:    csetm w8, pl
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v3.4s, v4.4h, #16
-; CHECK-NEXT:    shll v4.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.4h, v1.h[3]
-; CHECK-NEXT:    csetm w9, pl
-; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[4]
-; CHECK-NEXT:    dup v6.8h, v0.h[4]
-; CHECK-NEXT:    mov v2.h[1], w8
-; CHECK-NEXT:    csetm w8, pl
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[5]
-; CHECK-NEXT:    dup v6.8h, v0.h[5]
-; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    csetm w8, pl
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[6]
-; CHECK-NEXT:    dup v6.8h, v0.h[6]
-; CHECK-NEXT:    dup v1.8h, v1.h[7]
-; CHECK-NEXT:    dup v0.8h, v0.h[7]
-; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    csetm w8, pl
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll2 v2.4s, v0.8h, #16
+; CHECK-NEXT:    shll2 v3.4s, v1.8h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    csetm w8, pl
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    mov v2.h[5], w8
-; CHECK-NEXT:    csetm w8, pl
-; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    csetm w8, pl
-; CHECK-NEXT:    mov v2.h[7], w8
-; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    fcmgt v2.4s, v3.4s, v2.4s
+; CHECK-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
   %1 = fcmp uge <8 x bfloat> %a, %b
   ret <8 x i1> %1
@@ -1142,61 +962,17 @@ define <8 x i1> @test_fcmp_uge(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ult(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ult:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v2.4h, v1.h[1]
-; CHECK-NEXT:    dup v3.4h, v0.h[1]
-; CHECK-NEXT:    dup v4.4h, v1.h[2]
-; CHECK-NEXT:    dup v5.4h, v0.h[2]
-; CHECK-NEXT:    dup v6.4h, v0.h[3]
-; CHECK-NEXT:    shll v2.4s, v2.4h, #16
-; CHECK-NEXT:    shll v3.4s, v3.4h, #16
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v2.4s, v1.4h, #16
-; CHECK-NEXT:    shll v3.4s, v0.4h, #16
-; CHECK-NEXT:    csetm w8, lt
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v3.4s, v4.4h, #16
-; CHECK-NEXT:    shll v4.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.4h, v1.h[3]
-; CHECK-NEXT:    csetm w9, lt
-; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[4]
-; CHECK-NEXT:    dup v6.8h, v0.h[4]
-; CHECK-NEXT:    mov v2.h[1], w8
-; CHECK-NEXT:    csetm w8, lt
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[5]
-; CHECK-NEXT:    dup v6.8h, v0.h[5]
-; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    csetm w8, lt
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[6]
-; CHECK-NEXT:    dup v6.8h, v0.h[6]
-; CHECK-NEXT:    dup v1.8h, v1.h[7]
-; CHECK-NEXT:    dup v0.8h, v0.h[7]
-; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    csetm w8, lt
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll2 v2.4s, v1.8h, #16
+; CHECK-NEXT:    shll2 v3.4s, v0.8h, #16
 ; CHECK-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    csetm w8, lt
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    mov v2.h[5], w8
-; CHECK-NEXT:    csetm w8, lt
-; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    csetm w8, lt
-; CHECK-NEXT:    mov v2.h[7], w8
-; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    fcmge v2.4s, v3.4s, v2.4s
+; CHECK-NEXT:    fcmge v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
   %1 = fcmp ult <8 x bfloat> %a, %b
   ret <8 x i1> %1
@@ -1205,61 +981,17 @@ define <8 x i1> @test_fcmp_ult(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ule(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ule:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v2.4h, v1.h[1]
-; CHECK-NEXT:    dup v3.4h, v0.h[1]
-; CHECK-NEXT:    dup v4.4h, v1.h[2]
-; CHECK-NEXT:    dup v5.4h, v0.h[2]
-; CHECK-NEXT:    dup v6.4h, v0.h[3]
-; CHECK-NEXT:    shll v2.4s, v2.4h, #16
-; CHECK-NEXT:    shll v3.4s, v3.4h, #16
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v2.4s, v1.4h, #16
-; CHECK-NEXT:    shll v3.4s, v0.4h, #16
-; CHECK-NEXT:    csetm w8, le
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v3.4s, v4.4h, #16
-; CHECK-NEXT:    shll v4.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.4h, v1.h[3]
-; CHECK-NEXT:    csetm w9, le
-; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[4]
-; CHECK-NEXT:    dup v6.8h, v0.h[4]
-; CHECK-NEXT:    mov v2.h[1], w8
-; CHECK-NEXT:    csetm w8, le
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[5]
-; CHECK-NEXT:    dup v6.8h, v0.h[5]
-; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    csetm w8, le
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[6]
-; CHECK-NEXT:    dup v6.8h, v0.h[6]
-; CHECK-NEXT:    dup v1.8h, v1.h[7]
-; CHECK-NEXT:    dup v0.8h, v0.h[7]
-; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    csetm w8, le
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll2 v2.4s, v1.8h, #16
+; CHECK-NEXT:    shll2 v3.4s, v0.8h, #16
 ; CHECK-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    csetm w8, le
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    mov v2.h[5], w8
-; CHECK-NEXT:    csetm w8, le
-; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    csetm w8, le
-; CHECK-NEXT:    mov v2.h[7], w8
-; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    fcmgt v2.4s, v3.4s, v2.4s
+; CHECK-NEXT:    fcmgt v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
   %1 = fcmp ule <8 x bfloat> %a, %b
   ret <8 x i1> %1
@@ -1268,61 +1000,21 @@ define <8 x i1> @test_fcmp_ule(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_uno(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_uno:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v2.4h, v1.h[1]
-; CHECK-NEXT:    dup v3.4h, v0.h[1]
-; CHECK-NEXT:    dup v4.4h, v1.h[2]
-; CHECK-NEXT:    dup v5.4h, v0.h[2]
-; CHECK-NEXT:    dup v6.4h, v0.h[3]
-; CHECK-NEXT:    shll v2.4s, v2.4h, #16
-; CHECK-NEXT:    shll v3.4s, v3.4h, #16
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v2.4s, v1.4h, #16
-; CHECK-NEXT:    shll v3.4s, v0.4h, #16
-; CHECK-NEXT:    csetm w8, vs
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v3.4s, v4.4h, #16
-; CHECK-NEXT:    shll v4.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.4h, v1.h[3]
-; CHECK-NEXT:    csetm w9, vs
-; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[4]
-; CHECK-NEXT:    dup v6.8h, v0.h[4]
-; CHECK-NEXT:    mov v2.h[1], w8
-; CHECK-NEXT:    csetm w8, vs
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[5]
-; CHECK-NEXT:    dup v6.8h, v0.h[5]
-; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    csetm w8, vs
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[6]
-; CHECK-NEXT:    dup v6.8h, v0.h[6]
-; CHECK-NEXT:    dup v1.8h, v1.h[7]
-; CHECK-NEXT:    dup v0.8h, v0.h[7]
-; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    csetm w8, vs
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll2 v2.4s, v1.8h, #16
+; CHECK-NEXT:    shll2 v3.4s, v0.8h, #16
 ; CHECK-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    csetm w8, vs
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    mov v2.h[5], w8
-; CHECK-NEXT:    csetm w8, vs
-; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    csetm w8, vs
-; CHECK-NEXT:    mov v2.h[7], w8
-; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    fcmge v4.4s, v3.4s, v2.4s
+; CHECK-NEXT:    fcmgt v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    fcmge v3.4s, v0.4s, v1.4s
+; CHECK-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    orr v1.16b, v2.16b, v4.16b
+; CHECK-NEXT:    orr v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
   %1 = fcmp uno <8 x bfloat> %a, %b
   ret <8 x i1> %1
@@ -1331,69 +1023,20 @@ define <8 x i1> @test_fcmp_uno(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_one(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_one:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v2.4h, v1.h[1]
-; CHECK-NEXT:    dup v3.4h, v0.h[1]
-; CHECK-NEXT:    dup v4.4h, v1.h[2]
-; CHECK-NEXT:    dup v5.4h, v0.h[2]
-; CHECK-NEXT:    dup v6.4h, v0.h[3]
-; CHECK-NEXT:    shll v2.4s, v2.4h, #16
-; CHECK-NEXT:    shll v3.4s, v3.4h, #16
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v2.4s, v1.4h, #16
-; CHECK-NEXT:    shll v3.4s, v0.4h, #16
-; CHECK-NEXT:    csetm w8, mi
-; CHECK-NEXT:    csinv w8, w8, wzr, le
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v3.4s, v4.4h, #16
-; CHECK-NEXT:    shll v4.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.4h, v1.h[3]
-; CHECK-NEXT:    csetm w9, mi
-; CHECK-NEXT:    csinv w9, w9, wzr, le
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[4]
-; CHECK-NEXT:    dup v6.8h, v0.h[4]
-; CHECK-NEXT:    mov v2.h[1], w8
-; CHECK-NEXT:    csetm w8, mi
-; CHECK-NEXT:    csinv w8, w8, wzr, le
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[5]
-; CHECK-NEXT:    dup v6.8h, v0.h[5]
-; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    csetm w8, mi
-; CHECK-NEXT:    csinv w8, w8, wzr, le
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[6]
-; CHECK-NEXT:    dup v6.8h, v0.h[6]
-; CHECK-NEXT:    dup v1.8h, v1.h[7]
-; CHECK-NEXT:    dup v0.8h, v0.h[7]
-; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    csetm w8, mi
-; CHECK-NEXT:    csinv w8, w8, wzr, le
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll2 v2.4s, v1.8h, #16
+; CHECK-NEXT:    shll2 v3.4s, v0.8h, #16
 ; CHECK-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    csetm w8, mi
-; CHECK-NEXT:    csinv w8, w8, wzr, le
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    mov v2.h[5], w8
-; CHECK-NEXT:    csetm w8, mi
-; CHECK-NEXT:    csinv w8, w8, wzr, le
-; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    csetm w8, mi
-; CHECK-NEXT:    csinv w8, w8, wzr, le
-; CHECK-NEXT:    mov v2.h[7], w8
-; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    fcmgt v4.4s, v3.4s, v2.4s
+; CHECK-NEXT:    fcmgt v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    fcmgt v3.4s, v0.4s, v1.4s
+; CHECK-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    orr v1.16b, v2.16b, v4.16b
+; CHECK-NEXT:    orr v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
   %1 = fcmp one <8 x bfloat> %a, %b
   ret <8 x i1> %1
@@ -1402,61 +1045,16 @@ define <8 x i1> @test_fcmp_one(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_oeq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_oeq:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v2.4h, v1.h[1]
-; CHECK-NEXT:    dup v3.4h, v0.h[1]
-; CHECK-NEXT:    dup v4.4h, v1.h[2]
-; CHECK-NEXT:    dup v5.4h, v0.h[2]
-; CHECK-NEXT:    dup v6.4h, v0.h[3]
-; CHECK-NEXT:    shll v2.4s, v2.4h, #16
-; CHECK-NEXT:    shll v3.4s, v3.4h, #16
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v2.4s, v1.4h, #16
-; CHECK-NEXT:    shll v3.4s, v0.4h, #16
-; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v3.4s, v4.4h, #16
-; CHECK-NEXT:    shll v4.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.4h, v1.h[3]
-; CHECK-NEXT:    csetm w9, eq
-; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[4]
-; CHECK-NEXT:    dup v6.8h, v0.h[4]
-; CHECK-NEXT:    mov v2.h[1], w8
-; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[5]
-; CHECK-NEXT:    dup v6.8h, v0.h[5]
-; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[6]
-; CHECK-NEXT:    dup v6.8h, v0.h[6]
-; CHECK-NEXT:    dup v1.8h, v1.h[7]
-; CHECK-NEXT:    dup v0.8h, v0.h[7]
-; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll2 v2.4s, v1.8h, #16
+; CHECK-NEXT:    shll2 v3.4s, v0.8h, #16
 ; CHECK-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    mov v2.h[5], w8
-; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    mov v2.h[7], w8
-; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    fcmeq v2.4s, v3.4s, v2.4s
+; CHECK-NEXT:    fcmeq v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
   %1 = fcmp oeq <8 x bfloat> %a, %b
   ret <8 x i1> %1
@@ -1465,61 +1063,16 @@ define <8 x i1> @test_fcmp_oeq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ogt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ogt:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v2.4h, v1.h[1]
-; CHECK-NEXT:    dup v3.4h, v0.h[1]
-; CHECK-NEXT:    dup v4.4h, v1.h[2]
-; CHECK-NEXT:    dup v5.4h, v0.h[2]
-; CHECK-NEXT:    dup v6.4h, v0.h[3]
-; CHECK-NEXT:    shll v2.4s, v2.4h, #16
-; CHECK-NEXT:    shll v3.4s, v3.4h, #16
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v2.4s, v1.4h, #16
-; CHECK-NEXT:    shll v3.4s, v0.4h, #16
-; CHECK-NEXT:    csetm w8, gt
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v3.4s, v4.4h, #16
-; CHECK-NEXT:    shll v4.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.4h, v1.h[3]
-; CHECK-NEXT:    csetm w9, gt
-; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[4]
-; CHECK-NEXT:    dup v6.8h, v0.h[4]
-; CHECK-NEXT:    mov v2.h[1], w8
-; CHECK-NEXT:    csetm w8, gt
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[5]
-; CHECK-NEXT:    dup v6.8h, v0.h[5]
-; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    csetm w8, gt
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[6]
-; CHECK-NEXT:    dup v6.8h, v0.h[6]
-; CHECK-NEXT:    dup v1.8h, v1.h[7]
-; CHECK-NEXT:    dup v0.8h, v0.h[7]
-; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    csetm w8, gt
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll2 v2.4s, v1.8h, #16
+; CHECK-NEXT:    shll2 v3.4s, v0.8h, #16
 ; CHECK-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    csetm w8, gt
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    mov v2.h[5], w8
-; CHECK-NEXT:    csetm w8, gt
-; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    csetm w8, gt
-; CHECK-NEXT:    mov v2.h[7], w8
-; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    fcmgt v2.4s, v3.4s, v2.4s
+; CHECK-NEXT:    fcmgt v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
   %1 = fcmp ogt <8 x bfloat> %a, %b
   ret <8 x i1> %1
@@ -1528,61 +1081,16 @@ define <8 x i1> @test_fcmp_ogt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_oge(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_oge:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v2.4h, v1.h[1]
-; CHECK-NEXT:    dup v3.4h, v0.h[1]
-; CHECK-NEXT:    dup v4.4h, v1.h[2]
-; CHECK-NEXT:    dup v5.4h, v0.h[2]
-; CHECK-NEXT:    dup v6.4h, v0.h[3]
-; CHECK-NEXT:    shll v2.4s, v2.4h, #16
-; CHECK-NEXT:    shll v3.4s, v3.4h, #16
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v2.4s, v1.4h, #16
-; CHECK-NEXT:    shll v3.4s, v0.4h, #16
-; CHECK-NEXT:    csetm w8, ge
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v3.4s, v4.4h, #16
-; CHECK-NEXT:    shll v4.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.4h, v1.h[3]
-; CHECK-NEXT:    csetm w9, ge
-; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[4]
-; CHECK-NEXT:    dup v6.8h, v0.h[4]
-; CHECK-NEXT:    mov v2.h[1], w8
-; CHECK-NEXT:    csetm w8, ge
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[5]
-; CHECK-NEXT:    dup v6.8h, v0.h[5]
-; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    csetm w8, ge
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[6]
-; CHECK-NEXT:    dup v6.8h, v0.h[6]
-; CHECK-NEXT:    dup v1.8h, v1.h[7]
-; CHECK-NEXT:    dup v0.8h, v0.h[7]
-; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    csetm w8, ge
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll2 v2.4s, v1.8h, #16
+; CHECK-NEXT:    shll2 v3.4s, v0.8h, #16
 ; CHECK-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    csetm w8, ge
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    mov v2.h[5], w8
-; CHECK-NEXT:    csetm w8, ge
-; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    csetm w8, ge
-; CHECK-NEXT:    mov v2.h[7], w8
-; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    fcmge v2.4s, v3.4s, v2.4s
+; CHECK-NEXT:    fcmge v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
   %1 = fcmp oge <8 x bfloat> %a, %b
   ret <8 x i1> %1
@@ -1591,61 +1099,16 @@ define <8 x i1> @test_fcmp_oge(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_olt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_olt:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v2.4h, v1.h[1]
-; CHECK-NEXT:    dup v3.4h, v0.h[1]
-; CHECK-NEXT:    dup v4.4h, v1.h[2]
-; CHECK-NEXT:    dup v5.4h, v0.h[2]
-; CHECK-NEXT:    dup v6.4h, v0.h[3]
-; CHECK-NEXT:    shll v2.4s, v2.4h, #16
-; CHECK-NEXT:    shll v3.4s, v3.4h, #16
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v2.4s, v1.4h, #16
-; CHECK-NEXT:    shll v3.4s, v0.4h, #16
-; CHECK-NEXT:    csetm w8, mi
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v3.4s, v4.4h, #16
-; CHECK-NEXT:    shll v4.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.4h, v1.h[3]
-; CHECK-NEXT:    csetm w9, mi
-; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[4]
-; CHECK-NEXT:    dup v6.8h, v0.h[4]
-; CHECK-NEXT:    mov v2.h[1], w8
-; CHECK-NEXT:    csetm w8, mi
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[5]
-; CHECK-NEXT:    dup v6.8h, v0.h[5]
-; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    csetm w8, mi
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[6]
-; CHECK-NEXT:    dup v6.8h, v0.h[6]
-; CHECK-NEXT:    dup v1.8h, v1.h[7]
-; CHECK-NEXT:    dup v0.8h, v0.h[7]
-; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    csetm w8, mi
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll2 v2.4s, v0.8h, #16
+; CHECK-NEXT:    shll2 v3.4s, v1.8h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    csetm w8, mi
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    mov v2.h[5], w8
-; CHECK-NEXT:    csetm w8, mi
-; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    csetm w8, mi
-; CHECK-NEXT:    mov v2.h[7], w8
-; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    fcmgt v2.4s, v3.4s, v2.4s
+; CHECK-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
   %1 = fcmp olt <8 x bfloat> %a, %b
   ret <8 x i1> %1
@@ -1654,61 +1117,16 @@ define <8 x i1> @test_fcmp_olt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ole(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ole:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v2.4h, v1.h[1]
-; CHECK-NEXT:    dup v3.4h, v0.h[1]
-; CHECK-NEXT:    dup v4.4h, v1.h[2]
-; CHECK-NEXT:    dup v5.4h, v0.h[2]
-; CHECK-NEXT:    dup v6.4h, v0.h[3]
-; CHECK-NEXT:    shll v2.4s, v2.4h, #16
-; CHECK-NEXT:    shll v3.4s, v3.4h, #16
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v2.4s, v1.4h, #16
-; CHECK-NEXT:    shll v3.4s, v0.4h, #16
-; CHECK-NEXT:    csetm w8, ls
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v3.4s, v4.4h, #16
-; CHECK-NEXT:    shll v4.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.4h, v1.h[3]
-; CHECK-NEXT:    csetm w9, ls
-; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[4]
-; CHECK-NEXT:    dup v6.8h, v0.h[4]
-; CHECK-NEXT:    mov v2.h[1], w8
-; CHECK-NEXT:    csetm w8, ls
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[5]
-; CHECK-NEXT:    dup v6.8h, v0.h[5]
-; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    csetm w8, ls
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[6]
-; CHECK-NEXT:    dup v6.8h, v0.h[6]
-; CHECK-NEXT:    dup v1.8h, v1.h[7]
-; CHECK-NEXT:    dup v0.8h, v0.h[7]
-; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    csetm w8, ls
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll2 v2.4s, v0.8h, #16
+; CHECK-NEXT:    shll2 v3.4s, v1.8h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    csetm w8, ls
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    mov v2.h[5], w8
-; CHECK-NEXT:    csetm w8, ls
-; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    csetm w8, ls
-; CHECK-NEXT:    mov v2.h[7], w8
-; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    fcmge v2.4s, v3.4s, v2.4s
+; CHECK-NEXT:    fcmge v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
   %1 = fcmp ole <8 x bfloat> %a, %b
   ret <8 x i1> %1
@@ -1717,61 +1135,20 @@ define <8 x i1> @test_fcmp_ole(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ord(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ord:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v2.4h, v1.h[1]
-; CHECK-NEXT:    dup v3.4h, v0.h[1]
-; CHECK-NEXT:    dup v4.4h, v1.h[2]
-; CHECK-NEXT:    dup v5.4h, v0.h[2]
-; CHECK-NEXT:    dup v6.4h, v0.h[3]
-; CHECK-NEXT:    shll v2.4s, v2.4h, #16
-; CHECK-NEXT:    shll v3.4s, v3.4h, #16
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v2.4s, v1.4h, #16
-; CHECK-NEXT:    shll v3.4s, v0.4h, #16
-; CHECK-NEXT:    csetm w8, vc
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    shll v3.4s, v4.4h, #16
-; CHECK-NEXT:    shll v4.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.4h, v1.h[3]
-; CHECK-NEXT:    csetm w9, vc
-; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[4]
-; CHECK-NEXT:    dup v6.8h, v0.h[4]
-; CHECK-NEXT:    mov v2.h[1], w8
-; CHECK-NEXT:    csetm w8, vc
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[5]
-; CHECK-NEXT:    dup v6.8h, v0.h[5]
-; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    csetm w8, vc
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
-; CHECK-NEXT:    dup v5.8h, v1.h[6]
-; CHECK-NEXT:    dup v6.8h, v0.h[6]
-; CHECK-NEXT:    dup v1.8h, v1.h[7]
-; CHECK-NEXT:    dup v0.8h, v0.h[7]
-; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    csetm w8, vc
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    shll v3.4s, v5.4h, #16
-; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll2 v2.4s, v1.8h, #16
+; CHECK-NEXT:    shll2 v3.4s, v0.8h, #16
 ; CHECK-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    csetm w8, vc
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    mov v2.h[5], w8
-; CHECK-NEXT:    csetm w8, vc
-; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    csetm w8, vc
-; CHECK-NEXT:    mov v2.h[7], w8
-; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    fcmge v4.4s, v3.4s, v2.4s
+; CHECK-NEXT:    fcmgt v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    fcmge v3.4s, v0.4s, v1.4s
+; CHECK-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    orr v1.16b, v2.16b, v4.16b
+; CHECK-NEXT:    orr v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
   %1 = fcmp ord <8 x bfloat> %a, %b
   ret <8 x i1> %1
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index 66f26fc9d8597..fa0cb46e16bb3 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -1138,61 +1138,15 @@ entry:
 define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x half> %e) {
 ; CHECK-SD-NOFP16-LABEL: v7f16_half:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h0
-; CHECK-SD-NOFP16-NEXT:    mov h16, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcmp s5, s4
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s7, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    csetm w9, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s4, s7
-; CHECK-SD-NOFP16-NEXT:    fmov s4, w9
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[1], w8
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s6, s5
-; CHECK-SD-NOFP16-NEXT:    mov h5, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[2], w8
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fcmp s16, s7
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[3], w8
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s6, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[4], w8
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s6, s5
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[5], w8
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s0, s1
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[6], w8
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[7], w8
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v4.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v4.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v5.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcmgt v4.4s, v5.4s, v4.4s
+; CHECK-SD-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-SD-NOFP16-NEXT:    uzp1 v0.8h, v0.8h, v4.8h
+; CHECK-SD-NOFP16-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-SD-NOFP16-NEXT:    cmlt v0.8h, v0.8h, #0
 ; CHECK-SD-NOFP16-NEXT:    bsl v0.16b, v2.16b, v3.16b
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
@@ -1314,61 +1268,15 @@ entry:
 define <8 x half> @v8f16_half(<8 x half> %a, <8 x half> %b, <8 x half> %d, <8 x half> %e) {
 ; CHECK-SD-NOFP16-LABEL: v8f16_half:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h0
-; CHECK-SD-NOFP16-NEXT:    mov h16, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcmp s5, s4
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s7, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    csetm w9, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s4, s7
-; CHECK-SD-NOFP16-NEXT:    fmov s4, w9
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[1], w8
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s6, s5
-; CHECK-SD-NOFP16-NEXT:    mov h5, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[2], w8
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fcmp s16, s7
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[3], w8
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s6, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[4], w8
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s6, s5
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[5], w8
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s0, s1
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[6], w8
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[7], w8
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v4.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v4.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v5.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcmgt v4.4s, v5.4s, v4.4s
+; CHECK-SD-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-SD-NOFP16-NEXT:    uzp1 v0.8h, v0.8h, v4.8h
+; CHECK-SD-NOFP16-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-SD-NOFP16-NEXT:    cmlt v0.8h, v0.8h, #0
 ; CHECK-SD-NOFP16-NEXT:    bsl v0.16b, v2.16b, v3.16b
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
@@ -1406,116 +1314,25 @@ entry:
 define <16 x half> @v16f16_half(<16 x half> %a, <16 x half> %b, <16 x half> %d, <16 x half> %e) {
 ; CHECK-SD-NOFP16-LABEL: v16f16_half:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h16, v3.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h18, v3.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h19, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-SD-NOFP16-NEXT:    fcmp s17, s16
-; CHECK-SD-NOFP16-NEXT:    mov h16, v3.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s21, s20
-; CHECK-SD-NOFP16-NEXT:    mov h20, v3.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h21, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    csetm w14, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s19, s18
-; CHECK-SD-NOFP16-NEXT:    mov h18, v3.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h19, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h20
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h21
-; CHECK-SD-NOFP16-NEXT:    csetm w13, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s17, s16
-; CHECK-SD-NOFP16-NEXT:    mov h16, v3.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-SD-NOFP16-NEXT:    mov h3, v3.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    csetm w11, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s21, s20
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    csetm w12, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s19, s18
-; CHECK-SD-NOFP16-NEXT:    mov h18, v2.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h19, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    csetm w10, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s17, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h18
-; CHECK-SD-NOFP16-NEXT:    mov h18, v2.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h19
-; CHECK-SD-NOFP16-NEXT:    mov h19, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    csetm w9, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s1, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-SD-NOFP16-NEXT:    csetm w15, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s17, s16
-; CHECK-SD-NOFP16-NEXT:    mov h16, v2.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-SD-NOFP16-NEXT:    csetm w16, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s3, s1
-; CHECK-SD-NOFP16-NEXT:    fmov s1, w14
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    csetm w14, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s19, s18
-; CHECK-SD-NOFP16-NEXT:    mov h18, v2.h[4]
-; CHECK-SD-NOFP16-NEXT:    fmov s3, w14
-; CHECK-SD-NOFP16-NEXT:    mov h19, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], w8
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s17, s16
-; CHECK-SD-NOFP16-NEXT:    mov h16, v2.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], w16
-; CHECK-SD-NOFP16-NEXT:    mov h17, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], w13
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], w8
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    fcmp s19, s18
-; CHECK-SD-NOFP16-NEXT:    mov h18, v2.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h19, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], w11
-; CHECK-SD-NOFP16-NEXT:    mov h2, v2.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], w8
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s17, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h19
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], w12
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], w8
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s17, s16
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], w10
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], w8
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s0, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], w9
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], w8
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], w15
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[7], w8
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v16.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v17.4s, v3.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v18.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v19.4s, v2.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-SD-NOFP16-NEXT:    fcmgt v16.4s, v17.4s, v16.4s
+; CHECK-SD-NOFP16-NEXT:    fcmgt v1.4s, v3.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    fcmgt v3.4s, v19.4s, v18.4s
+; CHECK-SD-NOFP16-NEXT:    fcmgt v0.4s, v2.4s, v0.4s
+; CHECK-SD-NOFP16-NEXT:    uzp1 v1.8h, v1.8h, v16.8h
+; CHECK-SD-NOFP16-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
+; CHECK-SD-NOFP16-NEXT:    shl v1.8h, v1.8h, #15
+; CHECK-SD-NOFP16-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-SD-NOFP16-NEXT:    cmlt v1.8h, v1.8h, #0
+; CHECK-SD-NOFP16-NEXT:    cmlt v0.8h, v0.8h, #0
 ; CHECK-SD-NOFP16-NEXT:    bsl v1.16b, v5.16b, v7.16b
-; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v3.16b
 ; CHECK-SD-NOFP16-NEXT:    bsl v0.16b, v4.16b, v6.16b
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
@@ -1567,89 +1384,44 @@ entry:
 define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32> %e) {
 ; CHECK-SD-NOFP16-LABEL: v7f16_i32:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcmp s3, s2
-; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s7, s6
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    csetm w9, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s5, s4
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    csetm w10, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s3, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h4
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h5
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    csetm w11, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s7, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    csetm w12, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s3, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h5
-; CHECK-SD-NOFP16-NEXT:    fmov s4, w9
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    add x9, sp, #8
-; CHECK-SD-NOFP16-NEXT:    csetm w13, mi
-; CHECK-SD-NOFP16-NEXT:    fmov s5, w13
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[1], w8
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v2.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v3.4s, v1.8h
 ; CHECK-SD-NOFP16-NEXT:    mov x8, sp
-; CHECK-SD-NOFP16-NEXT:    fcmp s3, s2
+; CHECK-SD-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    ldr s4, [sp, #24]
+; CHECK-SD-NOFP16-NEXT:    add x9, sp, #32
+; CHECK-SD-NOFP16-NEXT:    ld1 { v4.s }[1], [x9]
+; CHECK-SD-NOFP16-NEXT:    add x9, sp, #16
+; CHECK-SD-NOFP16-NEXT:    fcmgt v2.4s, v3.4s, v2.4s
+; CHECK-SD-NOFP16-NEXT:    fmov s3, w4
+; CHECK-SD-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-SD-NOFP16-NEXT:    fmov s1, w0
+; CHECK-SD-NOFP16-NEXT:    mov v3.s[1], w5
+; CHECK-SD-NOFP16-NEXT:    mov v1.s[1], w1
+; CHECK-SD-NOFP16-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
 ; CHECK-SD-NOFP16-NEXT:    fmov s2, w7
-; CHECK-SD-NOFP16-NEXT:    fmov s3, w0
-; CHECK-SD-NOFP16-NEXT:    mov v5.h[1], w12
+; CHECK-SD-NOFP16-NEXT:    mov v3.s[2], w6
 ; CHECK-SD-NOFP16-NEXT:    ld1 { v2.s }[1], [x8]
-; CHECK-SD-NOFP16-NEXT:    mov v3.s[1], w1
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[2], w10
-; CHECK-SD-NOFP16-NEXT:    fcmp s0, s1
-; CHECK-SD-NOFP16-NEXT:    fmov s1, w4
-; CHECK-SD-NOFP16-NEXT:    ldr s0, [sp, #24]
-; CHECK-SD-NOFP16-NEXT:    mov v5.h[2], w8
-; CHECK-SD-NOFP16-NEXT:    ld1 { v2.s }[2], [x9]
-; CHECK-SD-NOFP16-NEXT:    add x9, sp, #32
-; CHECK-SD-NOFP16-NEXT:    mov v3.s[2], w2
-; CHECK-SD-NOFP16-NEXT:    mov v1.s[1], w5
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    mov v4.h[3], w11
-; CHECK-SD-NOFP16-NEXT:    ld1 { v0.s }[1], [x9]
-; CHECK-SD-NOFP16-NEXT:    mov v5.h[3], w8
-; CHECK-SD-NOFP16-NEXT:    add x8, sp, #16
-; CHECK-SD-NOFP16-NEXT:    ld1 { v2.s }[3], [x8]
-; CHECK-SD-NOFP16-NEXT:    mov v3.s[3], w3
+; CHECK-SD-NOFP16-NEXT:    mov v1.s[2], w2
+; CHECK-SD-NOFP16-NEXT:    add x8, sp, #8
+; CHECK-SD-NOFP16-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-SD-NOFP16-NEXT:    ld1 { v2.s }[2], [x8]
 ; CHECK-SD-NOFP16-NEXT:    add x8, sp, #40
-; CHECK-SD-NOFP16-NEXT:    mov v1.s[2], w6
-; CHECK-SD-NOFP16-NEXT:    sshll v4.4s, v4.4h, #0
-; CHECK-SD-NOFP16-NEXT:    ld1 { v0.s }[2], [x8]
-; CHECK-SD-NOFP16-NEXT:    sshll v5.4s, v5.4h, #0
-; CHECK-SD-NOFP16-NEXT:    bit v2.16b, v3.16b, v4.16b
-; CHECK-SD-NOFP16-NEXT:    bit v0.16b, v1.16b, v5.16b
-; CHECK-SD-NOFP16-NEXT:    mov w1, v2.s[1]
-; CHECK-SD-NOFP16-NEXT:    mov w2, v2.s[2]
-; CHECK-SD-NOFP16-NEXT:    mov w3, v2.s[3]
-; CHECK-SD-NOFP16-NEXT:    fmov w0, s2
+; CHECK-SD-NOFP16-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-SD-NOFP16-NEXT:    mov v1.s[3], w3
+; CHECK-SD-NOFP16-NEXT:    ld1 { v4.s }[2], [x8]
+; CHECK-SD-NOFP16-NEXT:    ld1 { v2.s }[3], [x9]
+; CHECK-SD-NOFP16-NEXT:    sshll v5.4s, v0.4h, #0
+; CHECK-SD-NOFP16-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-SD-NOFP16-NEXT:    bif v1.16b, v2.16b, v5.16b
+; CHECK-SD-NOFP16-NEXT:    bsl v0.16b, v3.16b, v4.16b
+; CHECK-SD-NOFP16-NEXT:    mov w1, v1.s[1]
+; CHECK-SD-NOFP16-NEXT:    mov w2, v1.s[2]
+; CHECK-SD-NOFP16-NEXT:    mov w3, v1.s[3]
 ; CHECK-SD-NOFP16-NEXT:    mov w5, v0.s[1]
 ; CHECK-SD-NOFP16-NEXT:    mov w6, v0.s[2]
+; CHECK-SD-NOFP16-NEXT:    fmov w0, s1
 ; CHECK-SD-NOFP16-NEXT:    fmov w4, s0
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
@@ -1858,62 +1630,19 @@ entry:
 define <8 x i32> @v8f16_i32(<8 x half> %a, <8 x half> %b, <8 x i32> %d, <8 x i32> %e) {
 ; CHECK-SD-NOFP16-LABEL: v8f16_i32:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h16, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h18, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h19, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-SD-NOFP16-NEXT:    fcmp s7, s6
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s17, s16
-; CHECK-SD-NOFP16-NEXT:    mov h16, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    csetm w9, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s19, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    csetm w10, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s7, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h0
-; CHECK-SD-NOFP16-NEXT:    csetm w11, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s17, s16
-; CHECK-SD-NOFP16-NEXT:    mov h16, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    csetm w12, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s7, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h16
-; CHECK-SD-NOFP16-NEXT:    fmov s16, w9
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h17
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    csetm w13, mi
-; CHECK-SD-NOFP16-NEXT:    fmov s17, w13
-; CHECK-SD-NOFP16-NEXT:    mov v16.h[1], w8
-; CHECK-SD-NOFP16-NEXT:    fcmp s7, s6
-; CHECK-SD-NOFP16-NEXT:    mov v17.h[1], w12
-; CHECK-SD-NOFP16-NEXT:    mov v16.h[2], w10
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s0, s1
-; CHECK-SD-NOFP16-NEXT:    mov v17.h[2], w8
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    mov v16.h[3], w11
-; CHECK-SD-NOFP16-NEXT:    mov v17.h[3], w8
-; CHECK-SD-NOFP16-NEXT:    sshll v1.4s, v16.4h, #0
-; CHECK-SD-NOFP16-NEXT:    sshll v0.4s, v17.4h, #0
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v6.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v7.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcmgt v6.4s, v7.4s, v6.4s
+; CHECK-SD-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-SD-NOFP16-NEXT:    uzp1 v0.8h, v0.8h, v6.8h
+; CHECK-SD-NOFP16-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-SD-NOFP16-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-SD-NOFP16-NEXT:    sshll v6.4s, v0.4h, #0
+; CHECK-SD-NOFP16-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-SD-NOFP16-NEXT:    mov v1.16b, v0.16b
+; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v6.16b
 ; CHECK-SD-NOFP16-NEXT:    bsl v1.16b, v3.16b, v5.16b
 ; CHECK-SD-NOFP16-NEXT:    bsl v0.16b, v2.16b, v4.16b
 ; CHECK-SD-NOFP16-NEXT:    ret
@@ -1965,126 +1694,37 @@ entry:
 define <16 x i32> @v16f16_i32(<16 x half> %a, <16 x half> %b, <16 x i32> %d, <16 x i32> %e) {
 ; CHECK-SD-NOFP16-LABEL: v16f16_i32:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    mov h16, v3.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    mov h18, v3.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h19, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h20, v3.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h21, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h20
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h21
-; CHECK-SD-NOFP16-NEXT:    fcmp s17, s16
-; CHECK-SD-NOFP16-NEXT:    mov h16, v3.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s19, s18
-; CHECK-SD-NOFP16-NEXT:    mov h18, v3.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h19, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    csetm w10, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s21, s20
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-SD-NOFP16-NEXT:    csetm w9, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s17, s16
-; CHECK-SD-NOFP16-NEXT:    mov h16, v3.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v3.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    csetm w11, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s19, s18
-; CHECK-SD-NOFP16-NEXT:    mov h18, v2.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    mov h19, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    csetm w12, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s21, s20
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-SD-NOFP16-NEXT:    csetm w14, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s17, s16
-; CHECK-SD-NOFP16-NEXT:    mov h16, v2.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    csetm w13, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s1, s3
-; CHECK-SD-NOFP16-NEXT:    mov h1, v2.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    csetm w15, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s19, s18
-; CHECK-SD-NOFP16-NEXT:    mov h18, v2.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov h19, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    csetm w16, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s17, s16
-; CHECK-SD-NOFP16-NEXT:    mov h16, v2.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h17, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-SD-NOFP16-NEXT:    csetm w17, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s3, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h17
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h0
-; CHECK-SD-NOFP16-NEXT:    csetm w18, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s19, s18
-; CHECK-SD-NOFP16-NEXT:    fmov s18, w14
-; CHECK-SD-NOFP16-NEXT:    fmov s19, w17
-; CHECK-SD-NOFP16-NEXT:    csetm w0, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s3, s1
-; CHECK-SD-NOFP16-NEXT:    mov h1, v2.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v2.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    mov v18.h[1], w12
-; CHECK-SD-NOFP16-NEXT:    mov v19.h[1], w16
-; CHECK-SD-NOFP16-NEXT:    csetm w1, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s17, s16
-; CHECK-SD-NOFP16-NEXT:    fmov s16, w10
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    csetm w2, mi
-; CHECK-SD-NOFP16-NEXT:    mov v16.h[1], w8
-; CHECK-SD-NOFP16-NEXT:    mov v18.h[2], w13
-; CHECK-SD-NOFP16-NEXT:    fmov s17, w2
-; CHECK-SD-NOFP16-NEXT:    mov v19.h[2], w18
-; CHECK-SD-NOFP16-NEXT:    fcmp s3, s1
-; CHECK-SD-NOFP16-NEXT:    mov v17.h[1], w1
-; CHECK-SD-NOFP16-NEXT:    mov v16.h[2], w9
-; CHECK-SD-NOFP16-NEXT:    mov v18.h[3], w15
-; CHECK-SD-NOFP16-NEXT:    mov v19.h[3], w0
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    fcmp s0, s2
-; CHECK-SD-NOFP16-NEXT:    mov v17.h[2], w8
-; CHECK-SD-NOFP16-NEXT:    mov v16.h[3], w11
-; CHECK-SD-NOFP16-NEXT:    csetm w8, mi
-; CHECK-SD-NOFP16-NEXT:    mov v17.h[3], w8
-; CHECK-SD-NOFP16-NEXT:    sshll v2.4s, v16.4h, #0
-; CHECK-SD-NOFP16-NEXT:    sshll v16.4s, v18.4h, #0
-; CHECK-SD-NOFP16-NEXT:    ldp q0, q18, [sp]
-; CHECK-SD-NOFP16-NEXT:    sshll v1.4s, v17.4h, #0
-; CHECK-SD-NOFP16-NEXT:    sshll v17.4s, v19.4h, #0
-; CHECK-SD-NOFP16-NEXT:    ldp q19, q3, [sp, #32]
-; CHECK-SD-NOFP16-NEXT:    bit v0.16b, v4.16b, v1.16b
-; CHECK-SD-NOFP16-NEXT:    mov v1.16b, v17.16b
-; CHECK-SD-NOFP16-NEXT:    bit v3.16b, v7.16b, v2.16b
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v17.4s, v0.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v18.4s, v2.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v16.4s, v1.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl2 v19.4s, v3.8h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
+; CHECK-SD-NOFP16-NEXT:    fcmgt v17.4s, v18.4s, v17.4s
+; CHECK-SD-NOFP16-NEXT:    fcmgt v0.4s, v2.4s, v0.4s
+; CHECK-SD-NOFP16-NEXT:    fcmgt v2.4s, v19.4s, v16.4s
+; CHECK-SD-NOFP16-NEXT:    fcmgt v1.4s, v3.4s, v1.4s
+; CHECK-SD-NOFP16-NEXT:    ldp q18, q19, [sp, #32]
+; CHECK-SD-NOFP16-NEXT:    uzp1 v0.8h, v0.8h, v17.8h
+; CHECK-SD-NOFP16-NEXT:    uzp1 v1.8h, v1.8h, v2.8h
+; CHECK-SD-NOFP16-NEXT:    ldp q2, q20, [sp]
+; CHECK-SD-NOFP16-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-SD-NOFP16-NEXT:    shl v1.8h, v1.8h, #15
+; CHECK-SD-NOFP16-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-SD-NOFP16-NEXT:    cmlt v1.8h, v1.8h, #0
+; CHECK-SD-NOFP16-NEXT:    sshll v3.4s, v0.4h, #0
+; CHECK-SD-NOFP16-NEXT:    sshll v16.4s, v1.4h, #0
+; CHECK-SD-NOFP16-NEXT:    sshll2 v17.4s, v1.8h, #0
+; CHECK-SD-NOFP16-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v3.16b
+; CHECK-SD-NOFP16-NEXT:    mov v3.16b, v17.16b
+; CHECK-SD-NOFP16-NEXT:    bsl v1.16b, v5.16b, v20.16b
+; CHECK-SD-NOFP16-NEXT:    bsl v0.16b, v4.16b, v2.16b
 ; CHECK-SD-NOFP16-NEXT:    mov v2.16b, v16.16b
-; CHECK-SD-NOFP16-NEXT:    bsl v1.16b, v5.16b, v18.16b
-; CHECK-SD-NOFP16-NEXT:    bsl v2.16b, v6.16b, v19.16b
+; CHECK-SD-NOFP16-NEXT:    bsl v3.16b, v7.16b, v19.16b
+; CHECK-SD-NOFP16-NEXT:    bsl v2.16b, v6.16b, v18.16b
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: v16f16_i32:
diff --git a/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
index d4130e7a848b1..34788827d3075 100644
--- a/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
@@ -514,61 +514,17 @@ define <8 x i16> @fptoui_i16(<8 x half> %a) #0 {
 define <8 x i1> @test_fcmp_une(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-LABEL: test_fcmp_une:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    mov h2, v1.h[1]
-; CHECK-CVT-NEXT:    mov h3, v0.h[1]
-; CHECK-CVT-NEXT:    fcvt s4, h1
-; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[2]
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v0.h[2]
-; CHECK-CVT-NEXT:    mov h3, v1.h[3]
-; CHECK-CVT-NEXT:    csetm w8, ne
-; CHECK-CVT-NEXT:    fcmp s5, s4
-; CHECK-CVT-NEXT:    fcvt s5, h6
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    mov h4, v0.h[3]
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov h6, v0.h[4]
-; CHECK-CVT-NEXT:    csetm w9, ne
-; CHECK-CVT-NEXT:    fcmp s2, s5
-; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    mov h5, v1.h[4]
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    mov v2.h[1], w8
-; CHECK-CVT-NEXT:    csetm w8, ne
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    mov h3, v1.h[5]
-; CHECK-CVT-NEXT:    mov h4, v0.h[5]
-; CHECK-CVT-NEXT:    fcvt s5, h5
-; CHECK-CVT-NEXT:    mov v2.h[2], w8
-; CHECK-CVT-NEXT:    csetm w8, ne
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    fcmp s6, s5
-; CHECK-CVT-NEXT:    mov h5, v1.h[6]
-; CHECK-CVT-NEXT:    mov h6, v0.h[6]
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
-; CHECK-CVT-NEXT:    mov h0, v0.h[7]
-; CHECK-CVT-NEXT:    mov v2.h[3], w8
-; CHECK-CVT-NEXT:    csetm w8, ne
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    fcvt s3, h5
-; CHECK-CVT-NEXT:    fcvt s4, h6
-; CHECK-CVT-NEXT:    fcvt s1, h1
-; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov v2.h[4], w8
-; CHECK-CVT-NEXT:    csetm w8, ne
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    mov v2.h[5], w8
-; CHECK-CVT-NEXT:    csetm w8, ne
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    mov v2.h[6], w8
-; CHECK-CVT-NEXT:    csetm w8, ne
-; CHECK-CVT-NEXT:    mov v2.h[7], w8
-; CHECK-CVT-NEXT:    xtn v0.8b, v2.8h
+; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v1.8h
+; CHECK-CVT-NEXT:    fcvtl2 v3.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-CVT-NEXT:    fcmeq v2.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT:    fcmeq v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT:    mvn v0.16b, v0.16b
+; CHECK-CVT-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-CVT-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-CVT-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_fcmp_une:
@@ -584,69 +540,21 @@ define <8 x i1> @test_fcmp_une(<8 x half> %a, <8 x half> %b) #0 {
 define <8 x i1> @test_fcmp_ueq(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-LABEL: test_fcmp_ueq:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    mov h2, v1.h[1]
-; CHECK-CVT-NEXT:    mov h3, v0.h[1]
-; CHECK-CVT-NEXT:    fcvt s4, h1
-; CHECK-CVT-NEXT:    fcvt s6, h0
-; CHECK-CVT-NEXT:    mov h5, v1.h[2]
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v0.h[2]
-; CHECK-CVT-NEXT:    fcvt s3, h5
-; CHECK-CVT-NEXT:    mov h5, v0.h[3]
-; CHECK-CVT-NEXT:    csetm w8, eq
-; CHECK-CVT-NEXT:    csinv w8, w8, wzr, vc
-; CHECK-CVT-NEXT:    fcmp s6, s4
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    mov h4, v1.h[3]
-; CHECK-CVT-NEXT:    mov h6, v1.h[4]
-; CHECK-CVT-NEXT:    csetm w9, eq
-; CHECK-CVT-NEXT:    csinv w9, w9, wzr, vc
-; CHECK-CVT-NEXT:    fcmp s2, s3
-; CHECK-CVT-NEXT:    mov h2, v0.h[4]
-; CHECK-CVT-NEXT:    fcvt s3, h4
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    fmov s5, w9
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    mov v5.h[1], w8
-; CHECK-CVT-NEXT:    csetm w8, eq
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    csinv w8, w8, wzr, vc
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    mov h3, v1.h[5]
-; CHECK-CVT-NEXT:    mov h4, v0.h[5]
-; CHECK-CVT-NEXT:    mov v5.h[2], w8
-; CHECK-CVT-NEXT:    csetm w8, eq
-; CHECK-CVT-NEXT:    csinv w8, w8, wzr, vc
-; CHECK-CVT-NEXT:    fcmp s2, s6
-; CHECK-CVT-NEXT:    fcvt s2, h3
-; CHECK-CVT-NEXT:    fcvt s3, h4
-; CHECK-CVT-NEXT:    mov h4, v1.h[6]
-; CHECK-CVT-NEXT:    mov h6, v0.h[6]
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
-; CHECK-CVT-NEXT:    mov h0, v0.h[7]
-; CHECK-CVT-NEXT:    mov v5.h[3], w8
-; CHECK-CVT-NEXT:    csetm w8, eq
-; CHECK-CVT-NEXT:    csinv w8, w8, wzr, vc
-; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    fcvt s2, h4
-; CHECK-CVT-NEXT:    fcvt s3, h6
-; CHECK-CVT-NEXT:    fcvt s1, h1
-; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov v5.h[4], w8
-; CHECK-CVT-NEXT:    csetm w8, eq
-; CHECK-CVT-NEXT:    csinv w8, w8, wzr, vc
-; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov v5.h[5], w8
-; CHECK-CVT-NEXT:    csetm w8, eq
-; CHECK-CVT-NEXT:    csinv w8, w8, wzr, vc
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    mov v5.h[6], w8
-; CHECK-CVT-NEXT:    csetm w8, eq
-; CHECK-CVT-NEXT:    csinv w8, w8, wzr, vc
-; CHECK-CVT-NEXT:    mov v5.h[7], w8
-; CHECK-CVT-NEXT:    xtn v0.8b, v5.8h
+; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v1.8h
+; CHECK-CVT-NEXT:    fcvtl2 v3.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-CVT-NEXT:    fcmgt v4.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT:    fcmgt v2.4s, v2.4s, v3.4s
+; CHECK-CVT-NEXT:    fcmgt v3.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT:    orr v1.16b, v2.16b, v4.16b
+; CHECK-CVT-NEXT:    orr v0.16b, v0.16b, v3.16b
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-CVT-NEXT:    mvn v0.16b, v0.16b
+; CHECK-CVT-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-CVT-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-CVT-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_fcmp_ueq:
@@ -664,61 +572,17 @@ define <8 x i1> @test_fcmp_ueq(<8 x half> %a, <8 x half> %b) #0 {
 define <8 x i1> @test_fcmp_ugt(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-LABEL: test_fcmp_ugt:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    mov h2, v1.h[1]
-; CHECK-CVT-NEXT:    mov h3, v0.h[1]
-; CHECK-CVT-NEXT:    fcvt s4, h1
-; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[2]
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v0.h[2]
-; CHECK-CVT-NEXT:    mov h3, v1.h[3]
-; CHECK-CVT-NEXT:    csetm w8, hi
-; CHECK-CVT-NEXT:    fcmp s5, s4
-; CHECK-CVT-NEXT:    fcvt s5, h6
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    mov h4, v0.h[3]
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov h6, v0.h[4]
-; CHECK-CVT-NEXT:    csetm w9, hi
-; CHECK-CVT-NEXT:    fcmp s2, s5
-; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    mov h5, v1.h[4]
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    mov v2.h[1], w8
-; CHECK-CVT-NEXT:    csetm w8, hi
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    mov h3, v1.h[5]
-; CHECK-CVT-NEXT:    mov h4, v0.h[5]
-; CHECK-CVT-NEXT:    fcvt s5, h5
-; CHECK-CVT-NEXT:    mov v2.h[2], w8
-; CHECK-CVT-NEXT:    csetm w8, hi
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    fcmp s6, s5
-; CHECK-CVT-NEXT:    mov h5, v1.h[6]
-; CHECK-CVT-NEXT:    mov h6, v0.h[6]
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
-; CHECK-CVT-NEXT:    mov h0, v0.h[7]
-; CHECK-CVT-NEXT:    mov v2.h[3], w8
-; CHECK-CVT-NEXT:    csetm w8, hi
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    fcvt s3, h5
-; CHECK-CVT-NEXT:    fcvt s4, h6
-; CHECK-CVT-NEXT:    fcvt s1, h1
-; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov v2.h[4], w8
-; CHECK-CVT-NEXT:    csetm w8, hi
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    mov v2.h[5], w8
-; CHECK-CVT-NEXT:    csetm w8, hi
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    mov v2.h[6], w8
-; CHECK-CVT-NEXT:    csetm w8, hi
-; CHECK-CVT-NEXT:    mov v2.h[7], w8
-; CHECK-CVT-NEXT:    xtn v0.8b, v2.8h
+; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtl2 v3.4s, v1.8h
+; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-CVT-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-CVT-NEXT:    fcmge v2.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT:    fcmge v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT:    mvn v0.16b, v0.16b
+; CHECK-CVT-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-CVT-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-CVT-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_fcmp_ugt:
@@ -734,61 +598,17 @@ define <8 x i1> @test_fcmp_ugt(<8 x half> %a, <8 x half> %b) #0 {
 define <8 x i1> @test_fcmp_uge(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-LABEL: test_fcmp_uge:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    mov h2, v1.h[1]
-; CHECK-CVT-NEXT:    mov h3, v0.h[1]
-; CHECK-CVT-NEXT:    fcvt s4, h1
-; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[2]
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v0.h[2]
-; CHECK-CVT-NEXT:    mov h3, v1.h[3]
-; CHECK-CVT-NEXT:    csetm w8, pl
-; CHECK-CVT-NEXT:    fcmp s5, s4
-; CHECK-CVT-NEXT:    fcvt s5, h6
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    mov h4, v0.h[3]
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov h6, v0.h[4]
-; CHECK-CVT-NEXT:    csetm w9, pl
-; CHECK-CVT-NEXT:    fcmp s2, s5
-; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    mov h5, v1.h[4]
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    mov v2.h[1], w8
-; CHECK-CVT-NEXT:    csetm w8, pl
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    mov h3, v1.h[5]
-; CHECK-CVT-NEXT:    mov h4, v0.h[5]
-; CHECK-CVT-NEXT:    fcvt s5, h5
-; CHECK-CVT-NEXT:    mov v2.h[2], w8
-; CHECK-CVT-NEXT:    csetm w8, pl
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    fcmp s6, s5
-; CHECK-CVT-NEXT:    mov h5, v1.h[6]
-; CHECK-CVT-NEXT:    mov h6, v0.h[6]
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
-; CHECK-CVT-NEXT:    mov h0, v0.h[7]
-; CHECK-CVT-NEXT:    mov v2.h[3], w8
-; CHECK-CVT-NEXT:    csetm w8, pl
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    fcvt s3, h5
-; CHECK-CVT-NEXT:    fcvt s4, h6
-; CHECK-CVT-NEXT:    fcvt s1, h1
-; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov v2.h[4], w8
-; CHECK-CVT-NEXT:    csetm w8, pl
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    mov v2.h[5], w8
-; CHECK-CVT-NEXT:    csetm w8, pl
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    mov v2.h[6], w8
-; CHECK-CVT-NEXT:    csetm w8, pl
-; CHECK-CVT-NEXT:    mov v2.h[7], w8
-; CHECK-CVT-NEXT:    xtn v0.8b, v2.8h
+; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtl2 v3.4s, v1.8h
+; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-CVT-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-CVT-NEXT:    fcmgt v2.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT:    mvn v0.16b, v0.16b
+; CHECK-CVT-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-CVT-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-CVT-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_fcmp_uge:
@@ -804,61 +624,17 @@ define <8 x i1> @test_fcmp_uge(<8 x half> %a, <8 x half> %b) #0 {
 define <8 x i1> @test_fcmp_ult(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-LABEL: test_fcmp_ult:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    mov h2, v1.h[1]
-; CHECK-CVT-NEXT:    mov h3, v0.h[1]
-; CHECK-CVT-NEXT:    fcvt s4, h1
-; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[2]
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v0.h[2]
-; CHECK-CVT-NEXT:    mov h3, v1.h[3]
-; CHECK-CVT-NEXT:    csetm w8, lt
-; CHECK-CVT-NEXT:    fcmp s5, s4
-; CHECK-CVT-NEXT:    fcvt s5, h6
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    mov h4, v0.h[3]
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov h6, v0.h[4]
-; CHECK-CVT-NEXT:    csetm w9, lt
-; CHECK-CVT-NEXT:    fcmp s2, s5
-; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    mov h5, v1.h[4]
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    mov v2.h[1], w8
-; CHECK-CVT-NEXT:    csetm w8, lt
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    mov h3, v1.h[5]
-; CHECK-CVT-NEXT:    mov h4, v0.h[5]
-; CHECK-CVT-NEXT:    fcvt s5, h5
-; CHECK-CVT-NEXT:    mov v2.h[2], w8
-; CHECK-CVT-NEXT:    csetm w8, lt
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    fcmp s6, s5
-; CHECK-CVT-NEXT:    mov h5, v1.h[6]
-; CHECK-CVT-NEXT:    mov h6, v0.h[6]
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
-; CHECK-CVT-NEXT:    mov h0, v0.h[7]
-; CHECK-CVT-NEXT:    mov v2.h[3], w8
-; CHECK-CVT-NEXT:    csetm w8, lt
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    fcvt s3, h5
-; CHECK-CVT-NEXT:    fcvt s4, h6
-; CHECK-CVT-NEXT:    fcvt s1, h1
-; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov v2.h[4], w8
-; CHECK-CVT-NEXT:    csetm w8, lt
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    mov v2.h[5], w8
-; CHECK-CVT-NEXT:    csetm w8, lt
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    mov v2.h[6], w8
-; CHECK-CVT-NEXT:    csetm w8, lt
-; CHECK-CVT-NEXT:    mov v2.h[7], w8
-; CHECK-CVT-NEXT:    xtn v0.8b, v2.8h
+; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v1.8h
+; CHECK-CVT-NEXT:    fcvtl2 v3.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-CVT-NEXT:    fcmge v2.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT:    fcmge v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT:    mvn v0.16b, v0.16b
+; CHECK-CVT-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-CVT-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-CVT-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_fcmp_ult:
@@ -874,61 +650,17 @@ define <8 x i1> @test_fcmp_ult(<8 x half> %a, <8 x half> %b) #0 {
 define <8 x i1> @test_fcmp_ule(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-LABEL: test_fcmp_ule:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    mov h2, v1.h[1]
-; CHECK-CVT-NEXT:    mov h3, v0.h[1]
-; CHECK-CVT-NEXT:    fcvt s4, h1
-; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[2]
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v0.h[2]
-; CHECK-CVT-NEXT:    mov h3, v1.h[3]
-; CHECK-CVT-NEXT:    csetm w8, le
-; CHECK-CVT-NEXT:    fcmp s5, s4
-; CHECK-CVT-NEXT:    fcvt s5, h6
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    mov h4, v0.h[3]
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov h6, v0.h[4]
-; CHECK-CVT-NEXT:    csetm w9, le
-; CHECK-CVT-NEXT:    fcmp s2, s5
-; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    mov h5, v1.h[4]
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    mov v2.h[1], w8
-; CHECK-CVT-NEXT:    csetm w8, le
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    mov h3, v1.h[5]
-; CHECK-CVT-NEXT:    mov h4, v0.h[5]
-; CHECK-CVT-NEXT:    fcvt s5, h5
-; CHECK-CVT-NEXT:    mov v2.h[2], w8
-; CHECK-CVT-NEXT:    csetm w8, le
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    fcmp s6, s5
-; CHECK-CVT-NEXT:    mov h5, v1.h[6]
-; CHECK-CVT-NEXT:    mov h6, v0.h[6]
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
-; CHECK-CVT-NEXT:    mov h0, v0.h[7]
-; CHECK-CVT-NEXT:    mov v2.h[3], w8
-; CHECK-CVT-NEXT:    csetm w8, le
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    fcvt s3, h5
-; CHECK-CVT-NEXT:    fcvt s4, h6
-; CHECK-CVT-NEXT:    fcvt s1, h1
-; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov v2.h[4], w8
-; CHECK-CVT-NEXT:    csetm w8, le
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    mov v2.h[5], w8
-; CHECK-CVT-NEXT:    csetm w8, le
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    mov v2.h[6], w8
-; CHECK-CVT-NEXT:    csetm w8, le
-; CHECK-CVT-NEXT:    mov v2.h[7], w8
-; CHECK-CVT-NEXT:    xtn v0.8b, v2.8h
+; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v1.8h
+; CHECK-CVT-NEXT:    fcvtl2 v3.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-CVT-NEXT:    fcmgt v2.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT:    fcmgt v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT:    mvn v0.16b, v0.16b
+; CHECK-CVT-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-CVT-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-CVT-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_fcmp_ule:
@@ -944,61 +676,21 @@ define <8 x i1> @test_fcmp_ule(<8 x half> %a, <8 x half> %b) #0 {
 define <8 x i1> @test_fcmp_uno(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-LABEL: test_fcmp_uno:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    mov h2, v1.h[1]
-; CHECK-CVT-NEXT:    mov h3, v0.h[1]
-; CHECK-CVT-NEXT:    fcvt s4, h1
-; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[2]
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v0.h[2]
-; CHECK-CVT-NEXT:    mov h3, v1.h[3]
-; CHECK-CVT-NEXT:    csetm w8, vs
-; CHECK-CVT-NEXT:    fcmp s5, s4
-; CHECK-CVT-NEXT:    fcvt s5, h6
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    mov h4, v0.h[3]
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov h6, v0.h[4]
-; CHECK-CVT-NEXT:    csetm w9, vs
-; CHECK-CVT-NEXT:    fcmp s2, s5
-; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    mov h5, v1.h[4]
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    mov v2.h[1], w8
-; CHECK-CVT-NEXT:    csetm w8, vs
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    mov h3, v1.h[5]
-; CHECK-CVT-NEXT:    mov h4, v0.h[5]
-; CHECK-CVT-NEXT:    fcvt s5, h5
-; CHECK-CVT-NEXT:    mov v2.h[2], w8
-; CHECK-CVT-NEXT:    csetm w8, vs
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    fcmp s6, s5
-; CHECK-CVT-NEXT:    mov h5, v1.h[6]
-; CHECK-CVT-NEXT:    mov h6, v0.h[6]
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
-; CHECK-CVT-NEXT:    mov h0, v0.h[7]
-; CHECK-CVT-NEXT:    mov v2.h[3], w8
-; CHECK-CVT-NEXT:    csetm w8, vs
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    fcvt s3, h5
-; CHECK-CVT-NEXT:    fcvt s4, h6
-; CHECK-CVT-NEXT:    fcvt s1, h1
-; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov v2.h[4], w8
-; CHECK-CVT-NEXT:    csetm w8, vs
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    mov v2.h[5], w8
-; CHECK-CVT-NEXT:    csetm w8, vs
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    mov v2.h[6], w8
-; CHECK-CVT-NEXT:    csetm w8, vs
-; CHECK-CVT-NEXT:    mov v2.h[7], w8
-; CHECK-CVT-NEXT:    xtn v0.8b, v2.8h
+; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v1.8h
+; CHECK-CVT-NEXT:    fcvtl2 v3.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-CVT-NEXT:    fcmge v4.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT:    fcmgt v2.4s, v2.4s, v3.4s
+; CHECK-CVT-NEXT:    fcmge v3.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT:    orr v1.16b, v2.16b, v4.16b
+; CHECK-CVT-NEXT:    orr v0.16b, v0.16b, v3.16b
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-CVT-NEXT:    mvn v0.16b, v0.16b
+; CHECK-CVT-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-CVT-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-CVT-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_fcmp_uno:
@@ -1016,69 +708,20 @@ define <8 x i1> @test_fcmp_uno(<8 x half> %a, <8 x half> %b) #0 {
 define <8 x i1> @test_fcmp_one(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-LABEL: test_fcmp_one:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    mov h2, v1.h[1]
-; CHECK-CVT-NEXT:    mov h3, v0.h[1]
-; CHECK-CVT-NEXT:    fcvt s4, h1
-; CHECK-CVT-NEXT:    fcvt s6, h0
-; CHECK-CVT-NEXT:    mov h5, v1.h[2]
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v0.h[2]
-; CHECK-CVT-NEXT:    fcvt s3, h5
-; CHECK-CVT-NEXT:    mov h5, v0.h[3]
-; CHECK-CVT-NEXT:    csetm w8, mi
-; CHECK-CVT-NEXT:    csinv w8, w8, wzr, le
-; CHECK-CVT-NEXT:    fcmp s6, s4
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    mov h4, v1.h[3]
-; CHECK-CVT-NEXT:    mov h6, v1.h[4]
-; CHECK-CVT-NEXT:    csetm w9, mi
-; CHECK-CVT-NEXT:    csinv w9, w9, wzr, le
-; CHECK-CVT-NEXT:    fcmp s2, s3
-; CHECK-CVT-NEXT:    mov h2, v0.h[4]
-; CHECK-CVT-NEXT:    fcvt s3, h4
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    fmov s5, w9
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    mov v5.h[1], w8
-; CHECK-CVT-NEXT:    csetm w8, mi
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    csinv w8, w8, wzr, le
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    mov h3, v1.h[5]
-; CHECK-CVT-NEXT:    mov h4, v0.h[5]
-; CHECK-CVT-NEXT:    mov v5.h[2], w8
-; CHECK-CVT-NEXT:    csetm w8, mi
-; CHECK-CVT-NEXT:    csinv w8, w8, wzr, le
-; CHECK-CVT-NEXT:    fcmp s2, s6
-; CHECK-CVT-NEXT:    fcvt s2, h3
-; CHECK-CVT-NEXT:    fcvt s3, h4
-; CHECK-CVT-NEXT:    mov h4, v1.h[6]
-; CHECK-CVT-NEXT:    mov h6, v0.h[6]
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
-; CHECK-CVT-NEXT:    mov h0, v0.h[7]
-; CHECK-CVT-NEXT:    mov v5.h[3], w8
-; CHECK-CVT-NEXT:    csetm w8, mi
-; CHECK-CVT-NEXT:    csinv w8, w8, wzr, le
-; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    fcvt s2, h4
-; CHECK-CVT-NEXT:    fcvt s3, h6
-; CHECK-CVT-NEXT:    fcvt s1, h1
-; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov v5.h[4], w8
-; CHECK-CVT-NEXT:    csetm w8, mi
-; CHECK-CVT-NEXT:    csinv w8, w8, wzr, le
-; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov v5.h[5], w8
-; CHECK-CVT-NEXT:    csetm w8, mi
-; CHECK-CVT-NEXT:    csinv w8, w8, wzr, le
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    mov v5.h[6], w8
-; CHECK-CVT-NEXT:    csetm w8, mi
-; CHECK-CVT-NEXT:    csinv w8, w8, wzr, le
-; CHECK-CVT-NEXT:    mov v5.h[7], w8
-; CHECK-CVT-NEXT:    xtn v0.8b, v5.8h
+; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v1.8h
+; CHECK-CVT-NEXT:    fcvtl2 v3.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-CVT-NEXT:    fcmgt v4.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT:    fcmgt v2.4s, v2.4s, v3.4s
+; CHECK-CVT-NEXT:    fcmgt v3.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT:    orr v1.16b, v2.16b, v4.16b
+; CHECK-CVT-NEXT:    orr v0.16b, v0.16b, v3.16b
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-CVT-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-CVT-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-CVT-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_fcmp_one:
@@ -1095,61 +738,16 @@ define <8 x i1> @test_fcmp_one(<8 x half> %a, <8 x half> %b) #0 {
 define <8 x i1> @test_fcmp_oeq(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-LABEL: test_fcmp_oeq:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    mov h2, v1.h[1]
-; CHECK-CVT-NEXT:    mov h3, v0.h[1]
-; CHECK-CVT-NEXT:    fcvt s4, h1
-; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[2]
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v0.h[2]
-; CHECK-CVT-NEXT:    mov h3, v1.h[3]
-; CHECK-CVT-NEXT:    csetm w8, eq
-; CHECK-CVT-NEXT:    fcmp s5, s4
-; CHECK-CVT-NEXT:    fcvt s5, h6
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    mov h4, v0.h[3]
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov h6, v0.h[4]
-; CHECK-CVT-NEXT:    csetm w9, eq
-; CHECK-CVT-NEXT:    fcmp s2, s5
-; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    mov h5, v1.h[4]
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    mov v2.h[1], w8
-; CHECK-CVT-NEXT:    csetm w8, eq
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    mov h3, v1.h[5]
-; CHECK-CVT-NEXT:    mov h4, v0.h[5]
-; CHECK-CVT-NEXT:    fcvt s5, h5
-; CHECK-CVT-NEXT:    mov v2.h[2], w8
-; CHECK-CVT-NEXT:    csetm w8, eq
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    fcmp s6, s5
-; CHECK-CVT-NEXT:    mov h5, v1.h[6]
-; CHECK-CVT-NEXT:    mov h6, v0.h[6]
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
-; CHECK-CVT-NEXT:    mov h0, v0.h[7]
-; CHECK-CVT-NEXT:    mov v2.h[3], w8
-; CHECK-CVT-NEXT:    csetm w8, eq
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    fcvt s3, h5
-; CHECK-CVT-NEXT:    fcvt s4, h6
-; CHECK-CVT-NEXT:    fcvt s1, h1
-; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov v2.h[4], w8
-; CHECK-CVT-NEXT:    csetm w8, eq
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    mov v2.h[5], w8
-; CHECK-CVT-NEXT:    csetm w8, eq
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    mov v2.h[6], w8
-; CHECK-CVT-NEXT:    csetm w8, eq
-; CHECK-CVT-NEXT:    mov v2.h[7], w8
-; CHECK-CVT-NEXT:    xtn v0.8b, v2.8h
+; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v1.8h
+; CHECK-CVT-NEXT:    fcvtl2 v3.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-CVT-NEXT:    fcmeq v2.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT:    fcmeq v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-CVT-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-CVT-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_fcmp_oeq:
@@ -1164,61 +762,16 @@ define <8 x i1> @test_fcmp_oeq(<8 x half> %a, <8 x half> %b) #0 {
 define <8 x i1> @test_fcmp_ogt(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-LABEL: test_fcmp_ogt:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    mov h2, v1.h[1]
-; CHECK-CVT-NEXT:    mov h3, v0.h[1]
-; CHECK-CVT-NEXT:    fcvt s4, h1
-; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[2]
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v0.h[2]
-; CHECK-CVT-NEXT:    mov h3, v1.h[3]
-; CHECK-CVT-NEXT:    csetm w8, gt
-; CHECK-CVT-NEXT:    fcmp s5, s4
-; CHECK-CVT-NEXT:    fcvt s5, h6
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    mov h4, v0.h[3]
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov h6, v0.h[4]
-; CHECK-CVT-NEXT:    csetm w9, gt
-; CHECK-CVT-NEXT:    fcmp s2, s5
-; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    mov h5, v1.h[4]
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    mov v2.h[1], w8
-; CHECK-CVT-NEXT:    csetm w8, gt
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    mov h3, v1.h[5]
-; CHECK-CVT-NEXT:    mov h4, v0.h[5]
-; CHECK-CVT-NEXT:    fcvt s5, h5
-; CHECK-CVT-NEXT:    mov v2.h[2], w8
-; CHECK-CVT-NEXT:    csetm w8, gt
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    fcmp s6, s5
-; CHECK-CVT-NEXT:    mov h5, v1.h[6]
-; CHECK-CVT-NEXT:    mov h6, v0.h[6]
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
-; CHECK-CVT-NEXT:    mov h0, v0.h[7]
-; CHECK-CVT-NEXT:    mov v2.h[3], w8
-; CHECK-CVT-NEXT:    csetm w8, gt
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    fcvt s3, h5
-; CHECK-CVT-NEXT:    fcvt s4, h6
-; CHECK-CVT-NEXT:    fcvt s1, h1
-; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov v2.h[4], w8
-; CHECK-CVT-NEXT:    csetm w8, gt
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    mov v2.h[5], w8
-; CHECK-CVT-NEXT:    csetm w8, gt
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    mov v2.h[6], w8
-; CHECK-CVT-NEXT:    csetm w8, gt
-; CHECK-CVT-NEXT:    mov v2.h[7], w8
-; CHECK-CVT-NEXT:    xtn v0.8b, v2.8h
+; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v1.8h
+; CHECK-CVT-NEXT:    fcvtl2 v3.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-CVT-NEXT:    fcmgt v2.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT:    fcmgt v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-CVT-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-CVT-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_fcmp_ogt:
@@ -1233,61 +786,16 @@ define <8 x i1> @test_fcmp_ogt(<8 x half> %a, <8 x half> %b) #0 {
 define <8 x i1> @test_fcmp_oge(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-LABEL: test_fcmp_oge:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    mov h2, v1.h[1]
-; CHECK-CVT-NEXT:    mov h3, v0.h[1]
-; CHECK-CVT-NEXT:    fcvt s4, h1
-; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[2]
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v0.h[2]
-; CHECK-CVT-NEXT:    mov h3, v1.h[3]
-; CHECK-CVT-NEXT:    csetm w8, ge
-; CHECK-CVT-NEXT:    fcmp s5, s4
-; CHECK-CVT-NEXT:    fcvt s5, h6
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    mov h4, v0.h[3]
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov h6, v0.h[4]
-; CHECK-CVT-NEXT:    csetm w9, ge
-; CHECK-CVT-NEXT:    fcmp s2, s5
-; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    mov h5, v1.h[4]
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    mov v2.h[1], w8
-; CHECK-CVT-NEXT:    csetm w8, ge
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    mov h3, v1.h[5]
-; CHECK-CVT-NEXT:    mov h4, v0.h[5]
-; CHECK-CVT-NEXT:    fcvt s5, h5
-; CHECK-CVT-NEXT:    mov v2.h[2], w8
-; CHECK-CVT-NEXT:    csetm w8, ge
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    fcmp s6, s5
-; CHECK-CVT-NEXT:    mov h5, v1.h[6]
-; CHECK-CVT-NEXT:    mov h6, v0.h[6]
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
-; CHECK-CVT-NEXT:    mov h0, v0.h[7]
-; CHECK-CVT-NEXT:    mov v2.h[3], w8
-; CHECK-CVT-NEXT:    csetm w8, ge
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    fcvt s3, h5
-; CHECK-CVT-NEXT:    fcvt s4, h6
-; CHECK-CVT-NEXT:    fcvt s1, h1
-; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov v2.h[4], w8
-; CHECK-CVT-NEXT:    csetm w8, ge
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    mov v2.h[5], w8
-; CHECK-CVT-NEXT:    csetm w8, ge
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    mov v2.h[6], w8
-; CHECK-CVT-NEXT:    csetm w8, ge
-; CHECK-CVT-NEXT:    mov v2.h[7], w8
-; CHECK-CVT-NEXT:    xtn v0.8b, v2.8h
+; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v1.8h
+; CHECK-CVT-NEXT:    fcvtl2 v3.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-CVT-NEXT:    fcmge v2.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT:    fcmge v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-CVT-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-CVT-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_fcmp_oge:
@@ -1302,61 +810,16 @@ define <8 x i1> @test_fcmp_oge(<8 x half> %a, <8 x half> %b) #0 {
 define <8 x i1> @test_fcmp_olt(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-LABEL: test_fcmp_olt:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    mov h2, v1.h[1]
-; CHECK-CVT-NEXT:    mov h3, v0.h[1]
-; CHECK-CVT-NEXT:    fcvt s4, h1
-; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[2]
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v0.h[2]
-; CHECK-CVT-NEXT:    mov h3, v1.h[3]
-; CHECK-CVT-NEXT:    csetm w8, mi
-; CHECK-CVT-NEXT:    fcmp s5, s4
-; CHECK-CVT-NEXT:    fcvt s5, h6
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    mov h4, v0.h[3]
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov h6, v0.h[4]
-; CHECK-CVT-NEXT:    csetm w9, mi
-; CHECK-CVT-NEXT:    fcmp s2, s5
-; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    mov h5, v1.h[4]
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    mov v2.h[1], w8
-; CHECK-CVT-NEXT:    csetm w8, mi
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    mov h3, v1.h[5]
-; CHECK-CVT-NEXT:    mov h4, v0.h[5]
-; CHECK-CVT-NEXT:    fcvt s5, h5
-; CHECK-CVT-NEXT:    mov v2.h[2], w8
-; CHECK-CVT-NEXT:    csetm w8, mi
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    fcmp s6, s5
-; CHECK-CVT-NEXT:    mov h5, v1.h[6]
-; CHECK-CVT-NEXT:    mov h6, v0.h[6]
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
-; CHECK-CVT-NEXT:    mov h0, v0.h[7]
-; CHECK-CVT-NEXT:    mov v2.h[3], w8
-; CHECK-CVT-NEXT:    csetm w8, mi
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    fcvt s3, h5
-; CHECK-CVT-NEXT:    fcvt s4, h6
-; CHECK-CVT-NEXT:    fcvt s1, h1
-; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov v2.h[4], w8
-; CHECK-CVT-NEXT:    csetm w8, mi
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    mov v2.h[5], w8
-; CHECK-CVT-NEXT:    csetm w8, mi
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    mov v2.h[6], w8
-; CHECK-CVT-NEXT:    csetm w8, mi
-; CHECK-CVT-NEXT:    mov v2.h[7], w8
-; CHECK-CVT-NEXT:    xtn v0.8b, v2.8h
+; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtl2 v3.4s, v1.8h
+; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-CVT-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-CVT-NEXT:    fcmgt v2.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-CVT-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-CVT-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_fcmp_olt:
@@ -1371,61 +834,16 @@ define <8 x i1> @test_fcmp_olt(<8 x half> %a, <8 x half> %b) #0 {
 define <8 x i1> @test_fcmp_ole(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-LABEL: test_fcmp_ole:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    mov h2, v1.h[1]
-; CHECK-CVT-NEXT:    mov h3, v0.h[1]
-; CHECK-CVT-NEXT:    fcvt s4, h1
-; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[2]
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v0.h[2]
-; CHECK-CVT-NEXT:    mov h3, v1.h[3]
-; CHECK-CVT-NEXT:    csetm w8, ls
-; CHECK-CVT-NEXT:    fcmp s5, s4
-; CHECK-CVT-NEXT:    fcvt s5, h6
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    mov h4, v0.h[3]
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov h6, v0.h[4]
-; CHECK-CVT-NEXT:    csetm w9, ls
-; CHECK-CVT-NEXT:    fcmp s2, s5
-; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    mov h5, v1.h[4]
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    mov v2.h[1], w8
-; CHECK-CVT-NEXT:    csetm w8, ls
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    mov h3, v1.h[5]
-; CHECK-CVT-NEXT:    mov h4, v0.h[5]
-; CHECK-CVT-NEXT:    fcvt s5, h5
-; CHECK-CVT-NEXT:    mov v2.h[2], w8
-; CHECK-CVT-NEXT:    csetm w8, ls
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    fcmp s6, s5
-; CHECK-CVT-NEXT:    mov h5, v1.h[6]
-; CHECK-CVT-NEXT:    mov h6, v0.h[6]
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
-; CHECK-CVT-NEXT:    mov h0, v0.h[7]
-; CHECK-CVT-NEXT:    mov v2.h[3], w8
-; CHECK-CVT-NEXT:    csetm w8, ls
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    fcvt s3, h5
-; CHECK-CVT-NEXT:    fcvt s4, h6
-; CHECK-CVT-NEXT:    fcvt s1, h1
-; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov v2.h[4], w8
-; CHECK-CVT-NEXT:    csetm w8, ls
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    mov v2.h[5], w8
-; CHECK-CVT-NEXT:    csetm w8, ls
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    mov v2.h[6], w8
-; CHECK-CVT-NEXT:    csetm w8, ls
-; CHECK-CVT-NEXT:    mov v2.h[7], w8
-; CHECK-CVT-NEXT:    xtn v0.8b, v2.8h
+; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtl2 v3.4s, v1.8h
+; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-CVT-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-CVT-NEXT:    fcmge v2.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT:    fcmge v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-CVT-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-CVT-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_fcmp_ole:
@@ -1440,61 +858,20 @@ define <8 x i1> @test_fcmp_ole(<8 x half> %a, <8 x half> %b) #0 {
 define <8 x i1> @test_fcmp_ord(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-LABEL: test_fcmp_ord:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    mov h2, v1.h[1]
-; CHECK-CVT-NEXT:    mov h3, v0.h[1]
-; CHECK-CVT-NEXT:    fcvt s4, h1
-; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[2]
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v0.h[2]
-; CHECK-CVT-NEXT:    mov h3, v1.h[3]
-; CHECK-CVT-NEXT:    csetm w8, vc
-; CHECK-CVT-NEXT:    fcmp s5, s4
-; CHECK-CVT-NEXT:    fcvt s5, h6
-; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    mov h4, v0.h[3]
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov h6, v0.h[4]
-; CHECK-CVT-NEXT:    csetm w9, vc
-; CHECK-CVT-NEXT:    fcmp s2, s5
-; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    mov h5, v1.h[4]
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    mov v2.h[1], w8
-; CHECK-CVT-NEXT:    csetm w8, vc
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    mov h3, v1.h[5]
-; CHECK-CVT-NEXT:    mov h4, v0.h[5]
-; CHECK-CVT-NEXT:    fcvt s5, h5
-; CHECK-CVT-NEXT:    mov v2.h[2], w8
-; CHECK-CVT-NEXT:    csetm w8, vc
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    fcmp s6, s5
-; CHECK-CVT-NEXT:    mov h5, v1.h[6]
-; CHECK-CVT-NEXT:    mov h6, v0.h[6]
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
-; CHECK-CVT-NEXT:    mov h0, v0.h[7]
-; CHECK-CVT-NEXT:    mov v2.h[3], w8
-; CHECK-CVT-NEXT:    csetm w8, vc
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    fcvt s3, h5
-; CHECK-CVT-NEXT:    fcvt s4, h6
-; CHECK-CVT-NEXT:    fcvt s1, h1
-; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov v2.h[4], w8
-; CHECK-CVT-NEXT:    csetm w8, vc
-; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    mov v2.h[5], w8
-; CHECK-CVT-NEXT:    csetm w8, vc
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    mov v2.h[6], w8
-; CHECK-CVT-NEXT:    csetm w8, vc
-; CHECK-CVT-NEXT:    mov v2.h[7], w8
-; CHECK-CVT-NEXT:    xtn v0.8b, v2.8h
+; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v1.8h
+; CHECK-CVT-NEXT:    fcvtl2 v3.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-CVT-NEXT:    fcmge v4.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT:    fcmgt v2.4s, v2.4s, v3.4s
+; CHECK-CVT-NEXT:    fcmge v3.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT:    orr v1.16b, v2.16b, v4.16b
+; CHECK-CVT-NEXT:    orr v0.16b, v0.16b, v3.16b
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-CVT-NEXT:    shl v0.8h, v0.8h, #15
+; CHECK-CVT-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-CVT-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_fcmp_ord:



More information about the llvm-commits mailing list