[llvm] [AArch64][GlobalISel] More FCmp legalization. (PR #78734)

David Green via llvm-commits llvm-commits at lists.llvm.org
Sun Jan 28 03:32:52 PST 2024


https://github.com/davemgreen updated https://github.com/llvm/llvm-project/pull/78734

>From e0b25f2ff946ef0cab55e13096bc53d27463b594 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Sun, 28 Jan 2024 11:32:36 +0000
Subject: [PATCH] [AArch64][GlobalISel] More FCmp legalization.

This fills out the fcmp handling to be more like the other instructions, adding
better support for fp16 and some larger vectors.

Select of f16 values is still not handled optimally in places as the select is
only legal for s32 values, not s16. This would be correct for integer but not
necessarily for fp. It is as if we need to do legalization -> regbankselect ->
extra legalization -> selection.

There is some fallback due to some missing handling for shifts, which I think
Chuong is currently looking at.
---
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |   23 +-
 .../GISel/AArch64InstructionSelector.cpp      |   16 +-
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |   43 +-
 .../AArch64/GlobalISel/legalize-fcmp.mir      |   83 +-
 llvm/test/CodeGen/AArch64/fcmp.ll             | 1199 +++++++----------
 5 files changed, 526 insertions(+), 838 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 4b675e8da691c26..6c06afa0979e5bb 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -1556,6 +1556,15 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
     MI.eraseFromParent();
     return Legalized;
   }
+  case TargetOpcode::G_FCMP:
+    if (TypeIdx != 0)
+      return UnableToLegalize;
+
+    Observer.changingInstr(MI);
+    narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
+    Observer.changedInstr(MI);
+    return Legalized;
+
   case TargetOpcode::G_SEXT_INREG: {
     if (TypeIdx != 0)
       return UnableToLegalize;
@@ -5317,14 +5326,18 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
     Observer.changedInstr(MI);
     return Legalized;
   }
-  case TargetOpcode::G_ICMP: {
-    // TODO: the symmetric MoreTy works for targets like, e.g. NEON.
-    // For targets, like e.g. MVE, the result is a predicated vector (i1).
-    // This will need some refactoring.
+  case TargetOpcode::G_ICMP:
+  case TargetOpcode::G_FCMP: {
+    if (TypeIdx != 1)
+      return UnableToLegalize;
+
     Observer.changingInstr(MI);
     moreElementsVectorSrc(MI, MoreTy, 2);
     moreElementsVectorSrc(MI, MoreTy, 3);
-    moreElementsVectorDst(MI, MoreTy, 0);
+    LLT CondTy = LLT::fixed_vector(
+        MoreTy.getNumElements(),
+        MRI.getType(MI.getOperand(0).getReg()).getElementType());
+    moreElementsVectorDst(MI, CondTy, 0);
     Observer.changedInstr(MI);
     return Legalized;
   }
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 8344e79f78e1eb6..48cfb60210d965d 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -4600,8 +4600,7 @@ MachineInstr *AArch64InstructionSelector::emitFPCompare(
   if (Ty.isVector())
     return nullptr;
   unsigned OpSize = Ty.getSizeInBits();
-  if (OpSize != 32 && OpSize != 64)
-    return nullptr;
+  assert(OpSize == 16 || OpSize == 32 || OpSize == 64);
 
   // If this is a compare against +0.0, then we don't have
   // to explicitly materialize a constant.
@@ -4620,9 +4619,11 @@ MachineInstr *AArch64InstructionSelector::emitFPCompare(
       std::swap(LHS, RHS);
     }
   }
-  unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr},
-                              {AArch64::FCMPSri, AArch64::FCMPDri}};
-  unsigned CmpOpc = CmpOpcTbl[ShouldUseImm][OpSize == 64];
+  unsigned CmpOpcTbl[2][3] = {
+      {AArch64::FCMPHrr, AArch64::FCMPSrr, AArch64::FCMPDrr},
+      {AArch64::FCMPHri, AArch64::FCMPSri, AArch64::FCMPDri}};
+  unsigned CmpOpc =
+      CmpOpcTbl[ShouldUseImm][OpSize == 16 ? 0 : (OpSize == 32 ? 1 : 2)];
 
   // Partially build the compare. Decide if we need to add a use for the
   // third operand based off whether or not we're comparing against 0.0.
@@ -4889,18 +4890,21 @@ MachineInstr *AArch64InstructionSelector::emitConditionalComparison(
   // TODO: emit CMN as an optimization.
   auto &MRI = *MIB.getMRI();
   LLT OpTy = MRI.getType(LHS);
-  assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64);
   unsigned CCmpOpc;
   std::optional<ValueAndVReg> C;
   if (CmpInst::isIntPredicate(CC)) {
+    assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64);
     C = getIConstantVRegValWithLookThrough(RHS, MRI);
     if (C && C->Value.ult(32))
       CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWi : AArch64::CCMPXi;
     else
       CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr;
   } else {
+    assert(OpTy.getSizeInBits() == 16 || OpTy.getSizeInBits() == 32 ||
+           OpTy.getSizeInBits() == 64);
     switch (OpTy.getSizeInBits()) {
     case 16:
+      assert(STI.hasFullFP16() && "Expected Full FP16 for fp16 comparisons");
       CCmpOpc = AArch64::FCCMPHrr;
       break;
     case 32:
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index fd69a7d6c33d032..500db122ca50e75 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -527,39 +527,26 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .minScalarOrEltIf(
           [=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0,
           s64)
-      .moreElementsToNextPow2(0)
-      .clampNumElements(0, v8s8, v16s8)
-      .clampNumElements(0, v4s16, v8s16)
-      .clampNumElements(0, v2s32, v4s32)
-      .clampNumElements(0, v2s64, v2s64);
+      .moreElementsToNextPow2(1)
+      .clampNumElements(1, v8s8, v16s8)
+      .clampNumElements(1, v4s16, v8s16)
+      .clampNumElements(1, v2s32, v4s32)
+      .clampNumElements(1, v2s64, v2s64);
 
   getActionDefinitionsBuilder(G_FCMP)
-      // If we don't have full FP16 support, then scalarize the elements of
-      // vectors containing fp16 types.
-      .fewerElementsIf(
-          [=](const LegalityQuery &Query) {
-            const auto &Ty = Query.Types[0];
-            return Ty.isVector() && Ty.getElementType() == s16 && !HasFP16;
-          },
-          [=](const LegalityQuery &Query) { return std::make_pair(0, s16); })
-      // If we don't have full FP16 support, then widen s16 to s32 if we
-      // encounter it.
-      .widenScalarIf(
-          [=](const LegalityQuery &Query) {
-            return Query.Types[0] == s16 && !HasFP16;
-          },
-          [=](const LegalityQuery &Query) { return std::make_pair(0, s32); })
-      .legalFor({{s16, s16},
+      .legalFor({{s32, MinFPScalar},
                  {s32, s32},
                  {s32, s64},
                  {v4s32, v4s32},
                  {v2s32, v2s32},
-                 {v2s64, v2s64},
-                 {v4s16, v4s16},
-                 {v8s16, v8s16}})
+                 {v2s64, v2s64}})
+      .legalIf([=](const LegalityQuery &Query) {
+        const auto &Ty = Query.Types[1];
+        return (Ty == v8s16 || Ty == v4s16) && Ty == Query.Types[0] && HasFP16;
+      })
       .widenScalarOrEltToNextPow2(1)
-      .clampScalar(1, s32, s64)
       .clampScalar(0, s32, s32)
+      .clampScalarOrElt(1, MinFPScalar, s64)
       .minScalarEltSameAsIf(
           [=](const LegalityQuery &Query) {
             const LLT &Ty = Query.Types[0];
@@ -568,8 +555,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
                    Ty.getElementType() != SrcTy.getElementType();
           },
           0, 1)
-      .clampNumElements(0, v2s32, v4s32)
-      .clampMaxNumElements(1, s64, 2);
+      .clampNumElements(1, v4s16, v8s16)
+      .clampNumElements(1, v2s32, v4s32)
+      .clampMaxNumElements(1, s64, 2)
+      .moreElementsToNextPow2(1);
 
   // Extensions
   auto ExtLegalFunc = [=](const LegalityQuery &Query) {
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fcmp.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fcmp.mir
index 355cf193272e654..b7c86f235267e6d 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fcmp.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fcmp.mir
@@ -6,13 +6,13 @@ body:             |
   bb.0.entry:
     ; CHECK-LABEL: name: test_icmp
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
-    ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
-    ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
-    ; CHECK: [[FCMP:%[0-9]+]]:_(s32) = G_FCMP floatpred(oge), [[COPY]](s64), [[COPY1]]
-    ; CHECK: $w0 = COPY [[FCMP]](s32)
-    ; CHECK: [[FCMP1:%[0-9]+]]:_(s32) = G_FCMP floatpred(uno), [[TRUNC]](s32), [[TRUNC1]]
-    ; CHECK: $w0 = COPY [[FCMP1]](s32)
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+    ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s32) = G_FCMP floatpred(oge), [[COPY]](s64), [[COPY1]]
+    ; CHECK-NEXT: $w0 = COPY [[FCMP]](s32)
+    ; CHECK-NEXT: [[FCMP1:%[0-9]+]]:_(s32) = G_FCMP floatpred(uno), [[TRUNC]](s32), [[TRUNC1]]
+    ; CHECK-NEXT: $w0 = COPY [[FCMP1]](s32)
     %0:_(s64) = COPY $x0
     %1:_(s64) = COPY $x0
     %2:_(s32) = G_TRUNC %0(s64)
@@ -36,41 +36,17 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: %lhs:_(<8 x s16>) = COPY $q0
     ; CHECK-NEXT: %rhs:_(<8 x s16>) = COPY $q1
-    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES %lhs(<8 x s16>)
-    ; CHECK-NEXT: [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16), [[UV12:%[0-9]+]]:_(s16), [[UV13:%[0-9]+]]:_(s16), [[UV14:%[0-9]+]]:_(s16), [[UV15:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES %rhs(<8 x s16>)
-    ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[UV]](s16)
-    ; CHECK-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[UV8]](s16)
-    ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s32) = G_FCMP floatpred(oeq), [[FPEXT]](s32), [[FPEXT1]]
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[FCMP]](s32)
-    ; CHECK-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[UV1]](s16)
-    ; CHECK-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[UV9]](s16)
-    ; CHECK-NEXT: [[FCMP1:%[0-9]+]]:_(s32) = G_FCMP floatpred(oeq), [[FPEXT2]](s32), [[FPEXT3]]
-    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[FCMP1]](s32)
-    ; CHECK-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[UV2]](s16)
-    ; CHECK-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[UV10]](s16)
-    ; CHECK-NEXT: [[FCMP2:%[0-9]+]]:_(s32) = G_FCMP floatpred(oeq), [[FPEXT4]](s32), [[FPEXT5]]
-    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[FCMP2]](s32)
-    ; CHECK-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[UV3]](s16)
-    ; CHECK-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[UV11]](s16)
-    ; CHECK-NEXT: [[FCMP3:%[0-9]+]]:_(s32) = G_FCMP floatpred(oeq), [[FPEXT6]](s32), [[FPEXT7]]
-    ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[FCMP3]](s32)
-    ; CHECK-NEXT: [[FPEXT8:%[0-9]+]]:_(s32) = G_FPEXT [[UV4]](s16)
-    ; CHECK-NEXT: [[FPEXT9:%[0-9]+]]:_(s32) = G_FPEXT [[UV12]](s16)
-    ; CHECK-NEXT: [[FCMP4:%[0-9]+]]:_(s32) = G_FCMP floatpred(oeq), [[FPEXT8]](s32), [[FPEXT9]]
-    ; CHECK-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[FCMP4]](s32)
-    ; CHECK-NEXT: [[FPEXT10:%[0-9]+]]:_(s32) = G_FPEXT [[UV5]](s16)
-    ; CHECK-NEXT: [[FPEXT11:%[0-9]+]]:_(s32) = G_FPEXT [[UV13]](s16)
-    ; CHECK-NEXT: [[FCMP5:%[0-9]+]]:_(s32) = G_FCMP floatpred(oeq), [[FPEXT10]](s32), [[FPEXT11]]
-    ; CHECK-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[FCMP5]](s32)
-    ; CHECK-NEXT: [[FPEXT12:%[0-9]+]]:_(s32) = G_FPEXT [[UV6]](s16)
-    ; CHECK-NEXT: [[FPEXT13:%[0-9]+]]:_(s32) = G_FPEXT [[UV14]](s16)
-    ; CHECK-NEXT: [[FCMP6:%[0-9]+]]:_(s32) = G_FCMP floatpred(oeq), [[FPEXT12]](s32), [[FPEXT13]]
-    ; CHECK-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[FCMP6]](s32)
-    ; CHECK-NEXT: [[FPEXT14:%[0-9]+]]:_(s32) = G_FPEXT [[UV7]](s16)
-    ; CHECK-NEXT: [[FPEXT15:%[0-9]+]]:_(s32) = G_FPEXT [[UV15]](s16)
-    ; CHECK-NEXT: [[FCMP7:%[0-9]+]]:_(s32) = G_FCMP floatpred(oeq), [[FPEXT14]](s32), [[FPEXT15]]
-    ; CHECK-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[FCMP7]](s32)
-    ; CHECK-NEXT: %fcmp:_(<8 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16), [[TRUNC6]](s16), [[TRUNC7]](s16)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s16>), [[UV1:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES %lhs(<8 x s16>)
+    ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(<4 x s32>) = G_FPEXT [[UV]](<4 x s16>)
+    ; CHECK-NEXT: [[FPEXT1:%[0-9]+]]:_(<4 x s32>) = G_FPEXT [[UV1]](<4 x s16>)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<4 x s16>), [[UV3:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES %rhs(<8 x s16>)
+    ; CHECK-NEXT: [[FPEXT2:%[0-9]+]]:_(<4 x s32>) = G_FPEXT [[UV2]](<4 x s16>)
+    ; CHECK-NEXT: [[FPEXT3:%[0-9]+]]:_(<4 x s32>) = G_FPEXT [[UV3]](<4 x s16>)
+    ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(<4 x s32>) = G_FCMP floatpred(oeq), [[FPEXT]](<4 x s32>), [[FPEXT2]]
+    ; CHECK-NEXT: [[FCMP1:%[0-9]+]]:_(<4 x s32>) = G_FCMP floatpred(oeq), [[FPEXT1]](<4 x s32>), [[FPEXT3]]
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[FCMP]](<4 x s32>)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[FCMP1]](<4 x s32>)
+    ; CHECK-NEXT: %fcmp:_(<8 x s16>) = G_CONCAT_VECTORS [[TRUNC]](<4 x s16>), [[TRUNC1]](<4 x s16>)
     ; CHECK-NEXT: $q0 = COPY %fcmp(<8 x s16>)
     ; CHECK-NEXT: RET_ReallyLR implicit $q0
     %lhs:_(<8 x s16>) = COPY $q0
@@ -93,25 +69,10 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: %lhs:_(<4 x s16>) = COPY $d0
     ; CHECK-NEXT: %rhs:_(<4 x s16>) = COPY $d1
-    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES %lhs(<4 x s16>)
-    ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES %rhs(<4 x s16>)
-    ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[UV]](s16)
-    ; CHECK-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[UV4]](s16)
-    ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s32) = G_FCMP floatpred(oeq), [[FPEXT]](s32), [[FPEXT1]]
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[FCMP]](s32)
-    ; CHECK-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[UV1]](s16)
-    ; CHECK-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[UV5]](s16)
-    ; CHECK-NEXT: [[FCMP1:%[0-9]+]]:_(s32) = G_FCMP floatpred(oeq), [[FPEXT2]](s32), [[FPEXT3]]
-    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[FCMP1]](s32)
-    ; CHECK-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[UV2]](s16)
-    ; CHECK-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[UV6]](s16)
-    ; CHECK-NEXT: [[FCMP2:%[0-9]+]]:_(s32) = G_FCMP floatpred(oeq), [[FPEXT4]](s32), [[FPEXT5]]
-    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[FCMP2]](s32)
-    ; CHECK-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[UV3]](s16)
-    ; CHECK-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[UV7]](s16)
-    ; CHECK-NEXT: [[FCMP3:%[0-9]+]]:_(s32) = G_FCMP floatpred(oeq), [[FPEXT6]](s32), [[FPEXT7]]
-    ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[FCMP3]](s32)
-    ; CHECK-NEXT: %fcmp:_(<4 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(<4 x s32>) = G_FPEXT %lhs(<4 x s16>)
+    ; CHECK-NEXT: [[FPEXT1:%[0-9]+]]:_(<4 x s32>) = G_FPEXT %rhs(<4 x s16>)
+    ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(<4 x s32>) = G_FCMP floatpred(oeq), [[FPEXT]](<4 x s32>), [[FPEXT1]]
+    ; CHECK-NEXT: %fcmp:_(<4 x s16>) = G_TRUNC [[FCMP]](<4 x s32>)
     ; CHECK-NEXT: $d0 = COPY %fcmp(<4 x s16>)
     ; CHECK-NEXT: RET_ReallyLR implicit $d0
     %lhs:_(<4 x s16>) = COPY $d0
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index d3bc7fc6dc0634b..29138ba59cfe41b 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -4,15 +4,6 @@
 ; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
 ; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
 
-; CHECK-GI:       warning: Instruction selection used fallback path for v3f64_double
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v3f64_i32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v3f32_float
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for v3f32_i32
-; CHECK-GI-FP16-NEXT:  warning: Instruction selection used fallback path for v7f16_half
-; CHECK-GI-FP16-NEXT:  warning: Instruction selection used fallback path for v16f16_half
-; CHECK-GI-FP16-NEXT:  warning: Instruction selection used fallback path for v7f16_i32
-; CHECK-GI-FP16-NEXT:  warning: Instruction selection used fallback path for v16f16_i32
-
 define double @f64_double(double %a, double %b, double %d, double %e) {
 ; CHECK-LABEL: f64_double:
 ; CHECK:       // %bb.0: // %entry
@@ -79,19 +70,31 @@ define half @f16_half(half %a, half %b, half %d, half %e) {
 ; CHECK-SD-FP16-NEXT:    fcsel h0, h2, h3, mi
 ; CHECK-SD-FP16-NEXT:    ret
 ;
-; CHECK-GI-LABEL: f16_half:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fcvt s0, h0
-; CHECK-GI-NEXT:    fcvt s1, h1
-; CHECK-GI-NEXT:    // kill: def $h2 killed $h2 def $s2
-; CHECK-GI-NEXT:    // kill: def $h3 killed $h3 def $s3
-; CHECK-GI-NEXT:    fmov w8, s2
-; CHECK-GI-NEXT:    fmov w9, s3
-; CHECK-GI-NEXT:    fcmp s0, s1
-; CHECK-GI-NEXT:    csel w8, w8, w9, mi
-; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    // kill: def $h0 killed $h0 killed $s0
-; CHECK-GI-NEXT:    ret
+; CHECK-GI-NOFP16-LABEL: f16_half:
+; CHECK-GI-NOFP16:       // %bb.0: // %entry
+; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT:    // kill: def $h2 killed $h2 def $s2
+; CHECK-GI-NOFP16-NEXT:    // kill: def $h3 killed $h3 def $s3
+; CHECK-GI-NOFP16-NEXT:    fmov w8, s2
+; CHECK-GI-NOFP16-NEXT:    fmov w9, s3
+; CHECK-GI-NOFP16-NEXT:    fcmp s0, s1
+; CHECK-GI-NOFP16-NEXT:    csel w8, w8, w9, mi
+; CHECK-GI-NOFP16-NEXT:    fmov s0, w8
+; CHECK-GI-NOFP16-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-GI-NOFP16-NEXT:    ret
+;
+; CHECK-GI-FP16-LABEL: f16_half:
+; CHECK-GI-FP16:       // %bb.0: // %entry
+; CHECK-GI-FP16-NEXT:    // kill: def $h2 killed $h2 def $s2
+; CHECK-GI-FP16-NEXT:    // kill: def $h3 killed $h3 def $s3
+; CHECK-GI-FP16-NEXT:    fcmp h0, h1
+; CHECK-GI-FP16-NEXT:    fmov w8, s2
+; CHECK-GI-FP16-NEXT:    fmov w9, s3
+; CHECK-GI-FP16-NEXT:    csel w8, w8, w9, mi
+; CHECK-GI-FP16-NEXT:    fmov s0, w8
+; CHECK-GI-FP16-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = fcmp olt half %a, %b
   %s = select i1 %c, half %d, half %e
@@ -113,13 +116,19 @@ define i32 @f16_i32(half %a, half %b, i32 %d, i32 %e) {
 ; CHECK-SD-FP16-NEXT:    csel w0, w0, w1, mi
 ; CHECK-SD-FP16-NEXT:    ret
 ;
-; CHECK-GI-LABEL: f16_i32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fcvt s0, h0
-; CHECK-GI-NEXT:    fcvt s1, h1
-; CHECK-GI-NEXT:    fcmp s0, s1
-; CHECK-GI-NEXT:    csel w0, w0, w1, mi
-; CHECK-GI-NEXT:    ret
+; CHECK-GI-NOFP16-LABEL: f16_i32:
+; CHECK-GI-NOFP16:       // %bb.0: // %entry
+; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT:    fcmp s0, s1
+; CHECK-GI-NOFP16-NEXT:    csel w0, w0, w1, mi
+; CHECK-GI-NOFP16-NEXT:    ret
+;
+; CHECK-GI-FP16-LABEL: f16_i32:
+; CHECK-GI-FP16:       // %bb.0: // %entry
+; CHECK-GI-FP16-NEXT:    fcmp h0, h1
+; CHECK-GI-FP16-NEXT:    csel w0, w0, w1, mi
+; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = fcmp olt half %a, %b
   %s = select i1 %c, i32 %d, i32 %e
@@ -139,32 +148,60 @@ entry:
 }
 
 define <3 x double> @v3f64_double(<3 x double> %a, <3 x double> %b, <3 x double> %d, <3 x double> %e) {
-; CHECK-LABEL: v3f64_double:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $d4 killed $d4 def $q4
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    // kill: def $d6 killed $d6 def $q6
-; CHECK-NEXT:    // kill: def $d7 killed $d7 def $q7
-; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT:    // kill: def $d5 killed $d5 def $q5
-; CHECK-NEXT:    ldr d16, [sp, #24]
-; CHECK-NEXT:    ldr d17, [sp]
-; CHECK-NEXT:    mov v3.d[1], v4.d[0]
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-NEXT:    mov v6.d[1], v7.d[0]
-; CHECK-NEXT:    ldp d1, d4, [sp, #8]
-; CHECK-NEXT:    fcmgt v2.2d, v5.2d, v2.2d
-; CHECK-NEXT:    mov v1.d[1], v4.d[0]
-; CHECK-NEXT:    fcmgt v0.2d, v3.2d, v0.2d
-; CHECK-NEXT:    bsl v2.16b, v17.16b, v16.16b
-; CHECK-NEXT:    // kill: def $d2 killed $d2 killed $q2
-; CHECK-NEXT:    bsl v0.16b, v6.16b, v1.16b
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v3f64_double:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d4 killed $d4 def $q4
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    // kill: def $d6 killed $d6 def $q6
+; CHECK-SD-NEXT:    // kill: def $d7 killed $d7 def $q7
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT:    // kill: def $d5 killed $d5 def $q5
+; CHECK-SD-NEXT:    ldr d16, [sp, #24]
+; CHECK-SD-NEXT:    ldr d17, [sp]
+; CHECK-SD-NEXT:    mov v3.d[1], v4.d[0]
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    mov v6.d[1], v7.d[0]
+; CHECK-SD-NEXT:    ldp d1, d4, [sp, #8]
+; CHECK-SD-NEXT:    fcmgt v2.2d, v5.2d, v2.2d
+; CHECK-SD-NEXT:    mov v1.d[1], v4.d[0]
+; CHECK-SD-NEXT:    fcmgt v0.2d, v3.2d, v0.2d
+; CHECK-SD-NEXT:    bsl v2.16b, v17.16b, v16.16b
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-SD-NEXT:    bsl v0.16b, v6.16b, v1.16b
+; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v3f64_double:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    // kill: def $d4 killed $d4 def $q4
+; CHECK-GI-NEXT:    // kill: def $d6 killed $d6 def $q6
+; CHECK-GI-NEXT:    // kill: def $d7 killed $d7 def $q7
+; CHECK-GI-NEXT:    fcmp d2, d5
+; CHECK-GI-NEXT:    ldr x8, [sp]
+; CHECK-GI-NEXT:    ldr x10, [sp, #24]
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT:    mov v6.d[1], v7.d[0]
+; CHECK-GI-NEXT:    ldp d1, d4, [sp, #8]
+; CHECK-GI-NEXT:    cset w9, mi
+; CHECK-GI-NEXT:    sbfx x9, x9, #0, #1
+; CHECK-GI-NEXT:    fcmgt v0.2d, v3.2d, v0.2d
+; CHECK-GI-NEXT:    mov v1.d[1], v4.d[0]
+; CHECK-GI-NEXT:    and x8, x8, x9
+; CHECK-GI-NEXT:    bic x9, x10, x9
+; CHECK-GI-NEXT:    orr x8, x8, x9
+; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    bsl v0.16b, v6.16b, v1.16b
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt <3 x double> %a, %b
   %s = select <3 x i1> %c, <3 x double> %d, <3 x double> %e
@@ -207,21 +244,54 @@ entry:
 }
 
 define <3 x i32> @v3f64_i32(<3 x double> %a, <3 x double> %b, <3 x i32> %d, <3 x i32> %e) {
-; CHECK-LABEL: v3f64_i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $d4 killed $d4 def $q4
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    // kill: def $d5 killed $d5 def $q5
-; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-NEXT:    mov v3.d[1], v4.d[0]
-; CHECK-NEXT:    fcmgt v1.2d, v5.2d, v2.2d
-; CHECK-NEXT:    fcmgt v0.2d, v3.2d, v0.2d
-; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    bsl v0.16b, v6.16b, v7.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v3f64_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d4 killed $d4 def $q4
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    // kill: def $d5 killed $d5 def $q5
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    mov v3.d[1], v4.d[0]
+; CHECK-SD-NEXT:    fcmgt v1.2d, v5.2d, v2.2d
+; CHECK-SD-NEXT:    fcmgt v0.2d, v3.2d, v0.2d
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    bsl v0.16b, v6.16b, v7.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v3f64_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #31 // =0x1f
+; CHECK-GI-NEXT:    fcmp d2, d5
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-GI-NEXT:    // kill: def $d4 killed $d4 def $q4
+; CHECK-GI-NEXT:    fmov s16, w8
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT:    cset w9, mi
+; CHECK-GI-NEXT:    mov v16.s[1], w8
+; CHECK-GI-NEXT:    fmov d1, x9
+; CHECK-GI-NEXT:    fcmgt v0.2d, v3.2d, v0.2d
+; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    mov v16.s[2], w8
+; CHECK-GI-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mov v2.s[1], w8
+; CHECK-GI-NEXT:    mov v16.s[3], w8
+; CHECK-GI-NEXT:    mov v2.s[2], w8
+; CHECK-GI-NEXT:    neg v1.4s, v16.4s
+; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v16.4s
+; CHECK-GI-NEXT:    mov v2.s[3], w8
+; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    and v0.16b, v6.16b, v0.16b
+; CHECK-GI-NEXT:    and v1.16b, v7.16b, v1.16b
+; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt <3 x double> %a, %b
   %s = select <3 x i1> %c, <3 x i32> %d, <3 x i32> %e
@@ -265,11 +335,33 @@ entry:
 }
 
 define <3 x float> @v3f32_float(<3 x float> %a, <3 x float> %b, <3 x float> %d, <3 x float> %e) {
-; CHECK-LABEL: v3f32_float:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    bsl v0.16b, v2.16b, v3.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v3f32_float:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT:    bsl v0.16b, v2.16b, v3.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v3f32_float:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #31 // =0x1f
+; CHECK-GI-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mov v4.s[1], w8
+; CHECK-GI-NEXT:    mov v4.s[2], w8
+; CHECK-GI-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-GI-NEXT:    fmov s5, w8
+; CHECK-GI-NEXT:    mov v5.s[1], w8
+; CHECK-GI-NEXT:    mov v4.s[3], w8
+; CHECK-GI-NEXT:    mov v5.s[2], w8
+; CHECK-GI-NEXT:    neg v1.4s, v4.4s
+; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v4.4s
+; CHECK-GI-NEXT:    mov v5.s[3], w8
+; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v5.16b
+; CHECK-GI-NEXT:    and v0.16b, v2.16b, v0.16b
+; CHECK-GI-NEXT:    and v1.16b, v3.16b, v1.16b
+; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt <3 x float> %a, %b
   %s = select <3 x i1> %c, <3 x float> %d, <3 x float> %e
@@ -323,11 +415,33 @@ entry:
 }
 
 define <3 x i32> @v3f32_i32(<3 x float> %a, <3 x float> %b, <3 x i32> %d, <3 x i32> %e) {
-; CHECK-LABEL: v3f32_i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    bsl v0.16b, v2.16b, v3.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v3f32_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT:    bsl v0.16b, v2.16b, v3.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v3f32_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #31 // =0x1f
+; CHECK-GI-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mov v4.s[1], w8
+; CHECK-GI-NEXT:    mov v4.s[2], w8
+; CHECK-GI-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-GI-NEXT:    fmov s5, w8
+; CHECK-GI-NEXT:    mov v5.s[1], w8
+; CHECK-GI-NEXT:    mov v4.s[3], w8
+; CHECK-GI-NEXT:    mov v5.s[2], w8
+; CHECK-GI-NEXT:    neg v1.4s, v4.4s
+; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v4.4s
+; CHECK-GI-NEXT:    mov v5.s[3], w8
+; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v5.16b
+; CHECK-GI-NEXT:    and v0.16b, v2.16b, v0.16b
+; CHECK-GI-NEXT:    and v1.16b, v3.16b, v1.16b
+; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fcmp olt <3 x float> %a, %b
   %s = select <3 x i1> %c, <3 x i32> %d, <3 x i32> %e
@@ -437,84 +551,49 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x
 ;
 ; CHECK-GI-NOFP16-LABEL: v7f16_half:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov w9, #15 // =0xf
-; CHECK-GI-NOFP16-NEXT:    fcvt s6, h0
-; CHECK-GI-NOFP16-NEXT:    fcvt s7, h1
-; CHECK-GI-NOFP16-NEXT:    mov h16, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h19, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h20, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvt s18, h4
-; CHECK-GI-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcmp s6, s7
-; CHECK-GI-NOFP16-NEXT:    fcvt s7, h16
-; CHECK-GI-NOFP16-NEXT:    fmov s6, w9
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h17
-; CHECK-GI-NOFP16-NEXT:    mov h17, v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s18, s5
-; CHECK-GI-NOFP16-NEXT:    mov h18, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov w8, #15 // =0xf
+; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h7, v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fmov s5, w8
-; CHECK-GI-NOFP16-NEXT:    cset w9, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s7, s16
-; CHECK-GI-NOFP16-NEXT:    fcvt s7, h4
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h17
-; CHECK-GI-NOFP16-NEXT:    fmov s17, w9
-; CHECK-GI-NOFP16-NEXT:    mov v4.16b, v6.16b
-; CHECK-GI-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v17.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v6.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcmp s7, s16
-; CHECK-GI-NOFP16-NEXT:    fmov s7, w8
+; CHECK-GI-NOFP16-NEXT:    mov h16, v1.h[4]
 ; CHECK-GI-NOFP16-NEXT:    mov w8, #65535 // =0xffff
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v7.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov h17, v1.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h18, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov h19, v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    mov v4.16b, v5.16b
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v7.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fmov s7, w8
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v6.h[0]
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s18, s19
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-GI-NOFP16-NEXT:    fcvt s18, h20
-; CHECK-GI-NOFP16-NEXT:    fmov s19, w8
-; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT:    mov v16.16b, v7.16b
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v6.h[0]
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[3], v19.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[1], v7.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcmp s17, s18
-; CHECK-GI-NOFP16-NEXT:    fmov s17, w8
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[4], v6.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[4], v17.h[0]
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s0, s1
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[2], v7.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s0, w8
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[5], v6.h[0]
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[5], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s0, w8
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[3], v7.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[6], v6.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[6], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[4], v7.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v16.h[1], v17.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v17.16b, v7.16b
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v18.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v17.h[1], v7.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v16.h[2], v19.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[3], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v17.h[2], v7.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v16.h[3], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v6.4h
+; CHECK-GI-NOFP16-NEXT:    mov v17.h[3], v7.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v16.4s, v16.4h
+; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[4], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v17.h[4], v7.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcmgt v1.4s, v16.4s, v6.4s
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[5], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v17.h[5], v7.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[6], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v17.h[6], v7.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[7], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[7], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[5], v7.h[0]
-; CHECK-GI-NOFP16-NEXT:    neg v0.8h, v4.8h
-; CHECK-GI-NOFP16-NEXT:    ushl v1.8h, v5.8h, v4.8h
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[6], v7.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[7], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    sshl v0.8h, v1.8h, v0.8h
-; CHECK-GI-NOFP16-NEXT:    eor v1.16b, v0.16b, v16.16b
+; CHECK-GI-NOFP16-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-GI-NOFP16-NEXT:    neg v1.8h, v4.8h
+; CHECK-GI-NOFP16-NEXT:    ushl v0.8h, v0.8h, v4.8h
+; CHECK-GI-NOFP16-NEXT:    mov v17.h[7], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    sshl v0.8h, v0.8h, v1.8h
+; CHECK-GI-NOFP16-NEXT:    eor v1.16b, v0.16b, v17.16b
 ; CHECK-GI-NOFP16-NEXT:    and v0.16b, v2.16b, v0.16b
 ; CHECK-GI-NOFP16-NEXT:    and v1.16b, v3.16b, v1.16b
 ; CHECK-GI-NOFP16-NEXT:    orr v0.16b, v0.16b, v1.16b
@@ -522,8 +601,34 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x
 ;
 ; CHECK-GI-FP16-LABEL: v7f16_half:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
+; CHECK-GI-FP16-NEXT:    mov w8, #15 // =0xf
+; CHECK-GI-FP16-NEXT:    fmov s4, w8
+; CHECK-GI-FP16-NEXT:    mov w8, #65535 // =0xffff
+; CHECK-GI-FP16-NEXT:    fmov s6, w8
+; CHECK-GI-FP16-NEXT:    mov v5.16b, v4.16b
+; CHECK-GI-FP16-NEXT:    mov v7.16b, v6.16b
+; CHECK-GI-FP16-NEXT:    mov v5.h[1], v4.h[0]
+; CHECK-GI-FP16-NEXT:    mov v7.h[1], v6.h[0]
+; CHECK-GI-FP16-NEXT:    mov v5.h[2], v4.h[0]
+; CHECK-GI-FP16-NEXT:    mov v7.h[2], v6.h[0]
+; CHECK-GI-FP16-NEXT:    mov v5.h[3], v4.h[0]
+; CHECK-GI-FP16-NEXT:    mov v7.h[3], v6.h[0]
+; CHECK-GI-FP16-NEXT:    mov v5.h[4], v4.h[0]
+; CHECK-GI-FP16-NEXT:    mov v7.h[4], v6.h[0]
+; CHECK-GI-FP16-NEXT:    mov v5.h[5], v4.h[0]
+; CHECK-GI-FP16-NEXT:    mov v7.h[5], v6.h[0]
+; CHECK-GI-FP16-NEXT:    mov v5.h[6], v4.h[0]
+; CHECK-GI-FP16-NEXT:    mov v7.h[6], v6.h[0]
+; CHECK-GI-FP16-NEXT:    mov v5.h[7], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    fcmgt v0.8h, v1.8h, v0.8h
-; CHECK-GI-FP16-NEXT:    bsl v0.16b, v2.16b, v3.16b
+; CHECK-GI-FP16-NEXT:    neg v1.8h, v5.8h
+; CHECK-GI-FP16-NEXT:    ushl v0.8h, v0.8h, v5.8h
+; CHECK-GI-FP16-NEXT:    mov v7.h[7], v0.h[0]
+; CHECK-GI-FP16-NEXT:    sshl v0.8h, v0.8h, v1.8h
+; CHECK-GI-FP16-NEXT:    eor v1.16b, v0.16b, v7.16b
+; CHECK-GI-FP16-NEXT:    and v0.16b, v2.16b, v0.16b
+; CHECK-GI-FP16-NEXT:    and v1.16b, v3.16b, v1.16b
+; CHECK-GI-FP16-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = fcmp olt <7 x half> %a, %b
@@ -549,39 +654,10 @@ define <4 x half> @v4f16_half(<4 x half> %a, <4 x half> %b, <4 x half> %d, <4 x
 ;
 ; CHECK-GI-NOFP16-LABEL: v4f16_half:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NOFP16-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    fcvt s6, h0
-; CHECK-GI-NOFP16-NEXT:    fcvt s7, h1
-; CHECK-GI-NOFP16-NEXT:    mov h16, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-GI-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-GI-NOFP16-NEXT:    fcmp s6, s7
-; CHECK-GI-NOFP16-NEXT:    fcvt s6, h16
-; CHECK-GI-NOFP16-NEXT:    fcvt s7, h17
-; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s4, s5
-; CHECK-GI-NOFP16-NEXT:    fmov s4, w8
-; CHECK-GI-NOFP16-NEXT:    cset w9, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s6, s7
-; CHECK-GI-NOFP16-NEXT:    fmov s5, w9
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s0, s1
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s0, w8
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s0, w8
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    shl v0.4h, v4.4h, #15
-; CHECK-GI-NOFP16-NEXT:    sshr v0.4h, v0.4h, #15
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    bsl v0.8b, v2.8b, v3.8b
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -665,68 +741,14 @@ define <8 x half> @v8f16_half(<8 x half> %a, <8 x half> %b, <8 x half> %d, <8 x
 ;
 ; CHECK-GI-NOFP16-LABEL: v8f16_half:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    fcvt s6, h0
-; CHECK-GI-NOFP16-NEXT:    fcvt s7, h1
-; CHECK-GI-NOFP16-NEXT:    mov h16, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h18, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-GI-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-GI-NOFP16-NEXT:    fcmp s6, s7
-; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvt s7, h16
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h17
-; CHECK-GI-NOFP16-NEXT:    mov h17, v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s4, s5
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-GI-NOFP16-NEXT:    fmov s4, w8
-; CHECK-GI-NOFP16-NEXT:    cset w9, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s7, s16
-; CHECK-GI-NOFP16-NEXT:    fcvt s7, h17
-; CHECK-GI-NOFP16-NEXT:    fmov s16, w9
-; CHECK-GI-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h18
-; CHECK-GI-NOFP16-NEXT:    mov h18, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v16.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h16, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcmp s6, s7
-; CHECK-GI-NOFP16-NEXT:    fmov s6, w8
-; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s5, s17
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v6.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    fcvt s5, h16
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h18
-; CHECK-GI-NOFP16-NEXT:    fmov s17, w8
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v17.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-GI-NOFP16-NEXT:    fcmp s5, s16
-; CHECK-GI-NOFP16-NEXT:    fmov s5, w8
-; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[4], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s6, s7
-; CHECK-GI-NOFP16-NEXT:    fmov s5, w8
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s0, s1
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[5], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s0, w8
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[6], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s0, w8
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[7], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    shl v0.8h, v4.8h, #15
+; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v5.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-GI-NOFP16-NEXT:    fcmgt v4.4s, v5.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    uzp1 v0.8h, v4.8h, v0.8h
+; CHECK-GI-NOFP16-NEXT:    shl v0.8h, v0.8h, #15
 ; CHECK-GI-NOFP16-NEXT:    sshr v0.8h, v0.8h, #15
 ; CHECK-GI-NOFP16-NEXT:    bsl v0.16b, v2.16b, v3.16b
 ; CHECK-GI-NOFP16-NEXT:    ret
@@ -868,130 +890,22 @@ define <16 x half> @v16f16_half(<16 x half> %a, <16 x half> %b, <16 x half> %d,
 ;
 ; CHECK-GI-NOFP16-LABEL: v16f16_half:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h16, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    fcvt s18, h0
-; CHECK-GI-NOFP16-NEXT:    fcvt s19, h2
-; CHECK-GI-NOFP16-NEXT:    mov h20, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h21, v2.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-GI-NOFP16-NEXT:    fcmp s18, s19
-; CHECK-GI-NOFP16-NEXT:    mov h18, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov h19, v2.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvt s20, h20
-; CHECK-GI-NOFP16-NEXT:    fcvt s21, h21
-; CHECK-GI-NOFP16-NEXT:    cset w14, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s16, s17
-; CHECK-GI-NOFP16-NEXT:    mov h16, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v2.h[4]
-; CHECK-GI-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-GI-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-GI-NOFP16-NEXT:    cset w15, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s20, s21
-; CHECK-GI-NOFP16-NEXT:    mov h20, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h21, v2.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-GI-NOFP16-NEXT:    cset w13, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s18, s19
-; CHECK-GI-NOFP16-NEXT:    mov h18, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h19, v2.h[6]
-; CHECK-GI-NOFP16-NEXT:    fcvt s20, h20
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-GI-NOFP16-NEXT:    fcvt s21, h21
-; CHECK-GI-NOFP16-NEXT:    mov h2, v2.h[7]
-; CHECK-GI-NOFP16-NEXT:    cset w12, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s16, s17
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h18
-; CHECK-GI-NOFP16-NEXT:    fcvt s18, h1
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h19
-; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT:    fcvt s19, h3
-; CHECK-GI-NOFP16-NEXT:    cset w11, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s20, s21
-; CHECK-GI-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT:    mov h20, v3.h[3]
-; CHECK-GI-NOFP16-NEXT:    cset w9, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s16, s17
-; CHECK-GI-NOFP16-NEXT:    mov h16, v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v3.h[1]
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s0, s2
-; CHECK-GI-NOFP16-NEXT:    mov h0, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v3.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-GI-NOFP16-NEXT:    cset w10, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s18, s19
-; CHECK-GI-NOFP16-NEXT:    mov h18, v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT:    fcvt s19, h2
-; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    cset w16, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s16, s17
-; CHECK-GI-NOFP16-NEXT:    mov h16, v3.h[4]
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h18
-; CHECK-GI-NOFP16-NEXT:    fmov s18, w16
-; CHECK-GI-NOFP16-NEXT:    cset w17, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s0, s19
-; CHECK-GI-NOFP16-NEXT:    fcvt s19, h20
-; CHECK-GI-NOFP16-NEXT:    fmov s0, w14
-; CHECK-GI-NOFP16-NEXT:    fmov s20, w15
-; CHECK-GI-NOFP16-NEXT:    fmov s21, w17
-; CHECK-GI-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-GI-NOFP16-NEXT:    cset w14, mi
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v20.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[1], v21.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h20, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h21, v3.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcmp s17, s19
-; CHECK-GI-NOFP16-NEXT:    fmov s17, w13
-; CHECK-GI-NOFP16-NEXT:    fmov s19, w14
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v17.h[0]
-; CHECK-GI-NOFP16-NEXT:    cset w13, mi
-; CHECK-GI-NOFP16-NEXT:    mov h17, v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[2], v19.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h19, v3.h[6]
-; CHECK-GI-NOFP16-NEXT:    fcmp s2, s16
-; CHECK-GI-NOFP16-NEXT:    fcvt s2, h20
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h21
-; CHECK-GI-NOFP16-NEXT:    fmov s20, w12
-; CHECK-GI-NOFP16-NEXT:    fmov s21, w13
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-GI-NOFP16-NEXT:    mov h3, v3.h[7]
-; CHECK-GI-NOFP16-NEXT:    cset w12, mi
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v20.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[3], v21.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcmp s2, s16
-; CHECK-GI-NOFP16-NEXT:    fmov s2, w11
-; CHECK-GI-NOFP16-NEXT:    fmov s16, w12
-; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    cset w11, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s17, s19
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[4], v16.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvt s2, h3
-; CHECK-GI-NOFP16-NEXT:    fmov s3, w9
-; CHECK-GI-NOFP16-NEXT:    fmov s16, w11
-; CHECK-GI-NOFP16-NEXT:    cset w9, mi
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[5], v16.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcmp s1, s2
-; CHECK-GI-NOFP16-NEXT:    fmov s1, w8
-; CHECK-GI-NOFP16-NEXT:    fmov s2, w9
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fmov s1, w10
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[6], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s2, w8
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[7], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v16.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v17.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v18.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v2.4s, v2.8h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v19.4s, v3.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v3.4s, v3.8h
+; CHECK-GI-NOFP16-NEXT:    fcmgt v16.4s, v18.4s, v16.4s
+; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v2.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    fcmgt v2.4s, v19.4s, v17.4s
+; CHECK-GI-NOFP16-NEXT:    fcmgt v1.4s, v3.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    uzp1 v0.8h, v16.8h, v0.8h
+; CHECK-GI-NOFP16-NEXT:    uzp1 v1.8h, v2.8h, v1.8h
 ; CHECK-GI-NOFP16-NEXT:    shl v0.8h, v0.8h, #15
-; CHECK-GI-NOFP16-NEXT:    shl v1.8h, v18.8h, #15
+; CHECK-GI-NOFP16-NEXT:    shl v1.8h, v1.8h, #15
 ; CHECK-GI-NOFP16-NEXT:    sshr v0.8h, v0.8h, #15
 ; CHECK-GI-NOFP16-NEXT:    sshr v1.8h, v1.8h, #15
 ; CHECK-GI-NOFP16-NEXT:    bsl v0.16b, v4.16b, v6.16b
@@ -1000,10 +914,10 @@ define <16 x half> @v16f16_half(<16 x half> %a, <16 x half> %b, <16 x half> %d,
 ;
 ; CHECK-GI-FP16-LABEL: v16f16_half:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fcmgt v1.8h, v3.8h, v1.8h
 ; CHECK-GI-FP16-NEXT:    fcmgt v0.8h, v2.8h, v0.8h
-; CHECK-GI-FP16-NEXT:    bsl v1.16b, v5.16b, v7.16b
+; CHECK-GI-FP16-NEXT:    fcmgt v1.8h, v3.8h, v1.8h
 ; CHECK-GI-FP16-NEXT:    bsl v0.16b, v4.16b, v6.16b
+; CHECK-GI-FP16-NEXT:    bsl v1.16b, v5.16b, v7.16b
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = fcmp olt <16 x half> %a, %b
@@ -1137,104 +1051,72 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
 ;
 ; CHECK-GI-NOFP16-LABEL: v7f16_i32:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h3, v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov w13, #31 // =0x1f
-; CHECK-GI-NOFP16-NEXT:    fcvt s4, h0
-; CHECK-GI-NOFP16-NEXT:    fcvt s5, h1
-; CHECK-GI-NOFP16-NEXT:    ldr s17, [sp, #40]
-; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    fmov s16, w0
-; CHECK-GI-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-GI-NOFP16-NEXT:    fcmp s4, s5
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-GI-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-GI-NOFP16-NEXT:    mov v16.s[1], w1
-; CHECK-GI-NOFP16-NEXT:    cset w9, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s2, s3
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h3, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-GI-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-GI-NOFP16-NEXT:    mov v16.s[2], w2
-; CHECK-GI-NOFP16-NEXT:    cset w10, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s6, s7
-; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    cset w11, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s4, s5
-; CHECK-GI-NOFP16-NEXT:    mov v16.s[3], w3
-; CHECK-GI-NOFP16-NEXT:    fcvt s4, h6
-; CHECK-GI-NOFP16-NEXT:    ldr s6, [sp, #32]
-; CHECK-GI-NOFP16-NEXT:    fcvt s5, h7
-; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT:    fmov s7, w4
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s2, s3
-; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT:    fmov s2, w13
-; CHECK-GI-NOFP16-NEXT:    mov v7.s[1], w5
-; CHECK-GI-NOFP16-NEXT:    cset w12, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s4, s5
-; CHECK-GI-NOFP16-NEXT:    ldr s5, [sp]
-; CHECK-GI-NOFP16-NEXT:    fmov s3, w12
-; CHECK-GI-NOFP16-NEXT:    mov v2.s[1], w13
-; CHECK-GI-NOFP16-NEXT:    cset w14, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s0, s1
-; CHECK-GI-NOFP16-NEXT:    fmov s0, w9
-; CHECK-GI-NOFP16-NEXT:    mov v3.s[1], w14
-; CHECK-GI-NOFP16-NEXT:    ldr s1, [sp, #24]
-; CHECK-GI-NOFP16-NEXT:    mov v7.s[2], w6
-; CHECK-GI-NOFP16-NEXT:    mov v2.s[2], w13
-; CHECK-GI-NOFP16-NEXT:    cset w9, mi
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[1], w10
-; CHECK-GI-NOFP16-NEXT:    fmov w10, s5
-; CHECK-GI-NOFP16-NEXT:    fmov s5, w7
-; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v6.s[0]
-; CHECK-GI-NOFP16-NEXT:    ldr s6, [sp, #8]
-; CHECK-GI-NOFP16-NEXT:    mov v3.s[2], w9
-; CHECK-GI-NOFP16-NEXT:    mov w9, #-1 // =0xffffffff
-; CHECK-GI-NOFP16-NEXT:    fmov s4, w9
-; CHECK-GI-NOFP16-NEXT:    mov v2.s[3], w8
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[2], w11
-; CHECK-GI-NOFP16-NEXT:    mov v5.s[1], w10
-; CHECK-GI-NOFP16-NEXT:    mov v1.s[2], v17.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.s[1], w9
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov w8, #31 // =0x1f
+; CHECK-GI-NOFP16-NEXT:    mov h5, v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v1.h[5]
+; CHECK-GI-NOFP16-NEXT:    ldr s16, [sp, #32]
+; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    ldr s18, [sp, #40]
+; CHECK-GI-NOFP16-NEXT:    fmov s17, w4
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    fmov s4, w8
+; CHECK-GI-NOFP16-NEXT:    mov v17.s[1], w5
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v6.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v4.s[1], w8
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v7.h[0]
+; CHECK-GI-NOFP16-NEXT:    ldr s7, [sp]
+; CHECK-GI-NOFP16-NEXT:    mov v17.s[2], w6
+; CHECK-GI-NOFP16-NEXT:    fmov w9, s7
+; CHECK-GI-NOFP16-NEXT:    fmov s7, w7
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v4.s[2], w8
+; CHECK-GI-NOFP16-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[3], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    fmov s3, w8
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov v7.s[1], w9
+; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    mov v3.s[1], w8
+; CHECK-GI-NOFP16-NEXT:    ldr s2, [sp, #24]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v5.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT:    mov v4.s[3], w8
+; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.s[1], v16.s[0]
+; CHECK-GI-NOFP16-NEXT:    ldr s16, [sp, #8]
+; CHECK-GI-NOFP16-NEXT:    mov v3.s[2], w8
+; CHECK-GI-NOFP16-NEXT:    fmov w8, s16
+; CHECK-GI-NOFP16-NEXT:    fcmgt v5.4s, v5.4s, v6.4s
+; CHECK-GI-NOFP16-NEXT:    fmov s6, w0
+; CHECK-GI-NOFP16-NEXT:    neg v19.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.s[2], v18.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov v7.s[2], w8
+; CHECK-GI-NOFP16-NEXT:    mov v17.s[3], w8
+; CHECK-GI-NOFP16-NEXT:    mov v6.s[1], w1
 ; CHECK-GI-NOFP16-NEXT:    mov v3.s[3], w8
-; CHECK-GI-NOFP16-NEXT:    neg v18.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[3], w8
-; CHECK-GI-NOFP16-NEXT:    fmov w8, s6
-; CHECK-GI-NOFP16-NEXT:    mov v4.s[2], w9
-; CHECK-GI-NOFP16-NEXT:    ushl v2.4s, v3.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT:    ldr s3, [sp, #16]
-; CHECK-GI-NOFP16-NEXT:    mov v5.s[2], w8
+; CHECK-GI-NOFP16-NEXT:    ushl v4.4s, v5.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT:    ldr s5, [sp, #16]
+; CHECK-GI-NOFP16-NEXT:    mov v2.s[3], v0.s[0]
+; CHECK-GI-NOFP16-NEXT:    fmov w8, s5
+; CHECK-GI-NOFP16-NEXT:    mov v6.s[2], w2
+; CHECK-GI-NOFP16-NEXT:    sshl v4.4s, v4.4s, v19.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v7.s[3], w8
-; CHECK-GI-NOFP16-NEXT:    shl v0.4s, v0.4s, #31
-; CHECK-GI-NOFP16-NEXT:    sshl v2.4s, v2.4s, v18.4s
-; CHECK-GI-NOFP16-NEXT:    mov v4.s[3], w8
-; CHECK-GI-NOFP16-NEXT:    fmov w8, s3
-; CHECK-GI-NOFP16-NEXT:    mov v1.s[3], v0.s[0]
-; CHECK-GI-NOFP16-NEXT:    sshr v0.4s, v0.4s, #31
-; CHECK-GI-NOFP16-NEXT:    mov v5.s[3], w8
-; CHECK-GI-NOFP16-NEXT:    eor v3.16b, v2.16b, v4.16b
-; CHECK-GI-NOFP16-NEXT:    and v2.16b, v7.16b, v2.16b
-; CHECK-GI-NOFP16-NEXT:    and v1.16b, v1.16b, v3.16b
-; CHECK-GI-NOFP16-NEXT:    bsl v0.16b, v16.16b, v5.16b
-; CHECK-GI-NOFP16-NEXT:    orr v1.16b, v2.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT:    eor v1.16b, v4.16b, v3.16b
+; CHECK-GI-NOFP16-NEXT:    and v3.16b, v17.16b, v4.16b
+; CHECK-GI-NOFP16-NEXT:    mov v6.s[3], w3
+; CHECK-GI-NOFP16-NEXT:    and v1.16b, v2.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT:    bsl v0.16b, v6.16b, v7.16b
+; CHECK-GI-NOFP16-NEXT:    orr v1.16b, v3.16b, v1.16b
 ; CHECK-GI-NOFP16-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    mov s3, v0.s[2]
 ; CHECK-GI-NOFP16-NEXT:    mov s4, v0.s[3]
-; CHECK-GI-NOFP16-NEXT:    fmov w0, s0
 ; CHECK-GI-NOFP16-NEXT:    mov s5, v1.s[1]
 ; CHECK-GI-NOFP16-NEXT:    mov s6, v1.s[2]
+; CHECK-GI-NOFP16-NEXT:    fmov w0, s0
 ; CHECK-GI-NOFP16-NEXT:    fmov w4, s1
 ; CHECK-GI-NOFP16-NEXT:    fmov w1, s2
 ; CHECK-GI-NOFP16-NEXT:    fmov w2, s3
@@ -1245,37 +1127,78 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
 ;
 ; CHECK-GI-FP16-LABEL: v7f16_i32:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fmov s2, w0
-; CHECK-GI-FP16-NEXT:    fmov s3, w7
-; CHECK-GI-FP16-NEXT:    mov x8, sp
+; CHECK-GI-FP16-NEXT:    fcmgt v5.8h, v1.8h, v0.8h
+; CHECK-GI-FP16-NEXT:    mov w10, #31 // =0x1f
+; CHECK-GI-FP16-NEXT:    ldr s6, [sp]
+; CHECK-GI-FP16-NEXT:    fmov s2, w10
+; CHECK-GI-FP16-NEXT:    ldr s1, [sp, #24]
+; CHECK-GI-FP16-NEXT:    ldr s7, [sp, #32]
+; CHECK-GI-FP16-NEXT:    fmov s16, w0
+; CHECK-GI-FP16-NEXT:    ldr s17, [sp, #40]
+; CHECK-GI-FP16-NEXT:    mov v1.s[1], v7.s[0]
+; CHECK-GI-FP16-NEXT:    ldr s7, [sp, #8]
+; CHECK-GI-FP16-NEXT:    umov w8, v5.h[4]
+; CHECK-GI-FP16-NEXT:    umov w9, v5.h[5]
+; CHECK-GI-FP16-NEXT:    umov w11, v5.h[0]
+; CHECK-GI-FP16-NEXT:    umov w12, v5.h[1]
+; CHECK-GI-FP16-NEXT:    mov v2.s[1], w10
+; CHECK-GI-FP16-NEXT:    mov v16.s[1], w1
+; CHECK-GI-FP16-NEXT:    mov v1.s[2], v17.s[0]
+; CHECK-GI-FP16-NEXT:    fmov s3, w8
+; CHECK-GI-FP16-NEXT:    umov w8, v5.h[6]
+; CHECK-GI-FP16-NEXT:    fmov s0, w11
+; CHECK-GI-FP16-NEXT:    mov v2.s[2], w10
+; CHECK-GI-FP16-NEXT:    umov w10, v5.h[3]
+; CHECK-GI-FP16-NEXT:    mov v16.s[2], w2
+; CHECK-GI-FP16-NEXT:    mov v3.s[1], w9
+; CHECK-GI-FP16-NEXT:    umov w9, v5.h[2]
+; CHECK-GI-FP16-NEXT:    mov v0.s[1], w12
 ; CHECK-GI-FP16-NEXT:    fmov s5, w4
-; CHECK-GI-FP16-NEXT:    ldr s4, [sp, #24]
-; CHECK-GI-FP16-NEXT:    fcmgt v0.8h, v1.8h, v0.8h
-; CHECK-GI-FP16-NEXT:    add x9, sp, #32
-; CHECK-GI-FP16-NEXT:    mov v2.s[1], w1
-; CHECK-GI-FP16-NEXT:    ld1 { v3.s }[1], [x8]
-; CHECK-GI-FP16-NEXT:    add x8, sp, #8
+; CHECK-GI-FP16-NEXT:    mov v16.s[3], w3
+; CHECK-GI-FP16-NEXT:    mov v3.s[2], w8
+; CHECK-GI-FP16-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-GI-FP16-NEXT:    mov v0.s[2], w9
+; CHECK-GI-FP16-NEXT:    fmov s4, w8
+; CHECK-GI-FP16-NEXT:    mov v2.s[3], w8
 ; CHECK-GI-FP16-NEXT:    mov v5.s[1], w5
-; CHECK-GI-FP16-NEXT:    ld1 { v4.s }[1], [x9]
-; CHECK-GI-FP16-NEXT:    add x9, sp, #16
-; CHECK-GI-FP16-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-FP16-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-FP16-NEXT:    ld1 { v3.s }[2], [x8]
-; CHECK-GI-FP16-NEXT:    add x8, sp, #40
-; CHECK-GI-FP16-NEXT:    mov v2.s[2], w2
-; CHECK-GI-FP16-NEXT:    ld1 { v4.s }[2], [x8]
+; CHECK-GI-FP16-NEXT:    fmov w9, s6
+; CHECK-GI-FP16-NEXT:    fmov s6, w7
+; CHECK-GI-FP16-NEXT:    mov v4.s[1], w8
+; CHECK-GI-FP16-NEXT:    mov v3.s[3], w8
+; CHECK-GI-FP16-NEXT:    mov v0.s[3], w10
+; CHECK-GI-FP16-NEXT:    mov v6.s[1], w9
+; CHECK-GI-FP16-NEXT:    neg v18.4s, v2.4s
 ; CHECK-GI-FP16-NEXT:    mov v5.s[2], w6
-; CHECK-GI-FP16-NEXT:    ld1 { v3.s }[3], [x9]
-; CHECK-GI-FP16-NEXT:    mov v2.s[3], w3
-; CHECK-GI-FP16-NEXT:    bsl v0.16b, v5.16b, v4.16b
-; CHECK-GI-FP16-NEXT:    bsl v1.16b, v2.16b, v3.16b
-; CHECK-GI-FP16-NEXT:    mov w5, v0.s[1]
-; CHECK-GI-FP16-NEXT:    mov w6, v0.s[2]
-; CHECK-GI-FP16-NEXT:    fmov w4, s0
-; CHECK-GI-FP16-NEXT:    mov w1, v1.s[1]
-; CHECK-GI-FP16-NEXT:    mov w2, v1.s[2]
-; CHECK-GI-FP16-NEXT:    mov w3, v1.s[3]
-; CHECK-GI-FP16-NEXT:    fmov w0, s1
+; CHECK-GI-FP16-NEXT:    mov v4.s[2], w8
+; CHECK-GI-FP16-NEXT:    fmov w8, s7
+; CHECK-GI-FP16-NEXT:    ushl v2.4s, v3.4s, v2.4s
+; CHECK-GI-FP16-NEXT:    ldr s3, [sp, #16]
+; CHECK-GI-FP16-NEXT:    shl v0.4s, v0.4s, #31
+; CHECK-GI-FP16-NEXT:    mov v6.s[2], w8
+; CHECK-GI-FP16-NEXT:    sshl v2.4s, v2.4s, v18.4s
+; CHECK-GI-FP16-NEXT:    mov v5.s[3], w8
+; CHECK-GI-FP16-NEXT:    mov v4.s[3], w8
+; CHECK-GI-FP16-NEXT:    fmov w8, s3
+; CHECK-GI-FP16-NEXT:    mov v1.s[3], v0.s[0]
+; CHECK-GI-FP16-NEXT:    sshr v0.4s, v0.4s, #31
+; CHECK-GI-FP16-NEXT:    mov v6.s[3], w8
+; CHECK-GI-FP16-NEXT:    eor v3.16b, v2.16b, v4.16b
+; CHECK-GI-FP16-NEXT:    and v2.16b, v5.16b, v2.16b
+; CHECK-GI-FP16-NEXT:    and v1.16b, v1.16b, v3.16b
+; CHECK-GI-FP16-NEXT:    bsl v0.16b, v16.16b, v6.16b
+; CHECK-GI-FP16-NEXT:    orr v1.16b, v2.16b, v1.16b
+; CHECK-GI-FP16-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-FP16-NEXT:    mov s3, v0.s[2]
+; CHECK-GI-FP16-NEXT:    mov s4, v0.s[3]
+; CHECK-GI-FP16-NEXT:    fmov w0, s0
+; CHECK-GI-FP16-NEXT:    mov s5, v1.s[1]
+; CHECK-GI-FP16-NEXT:    mov s6, v1.s[2]
+; CHECK-GI-FP16-NEXT:    fmov w4, s1
+; CHECK-GI-FP16-NEXT:    fmov w1, s2
+; CHECK-GI-FP16-NEXT:    fmov w2, s3
+; CHECK-GI-FP16-NEXT:    fmov w3, s4
+; CHECK-GI-FP16-NEXT:    fmov w5, s5
+; CHECK-GI-FP16-NEXT:    fmov w6, s6
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = fcmp olt <7 x half> %a, %b
@@ -1301,40 +1224,9 @@ define <4 x i32> @v4f16_i32(<4 x half> %a, <4 x half> %b, <4 x i32> %d, <4 x i32
 ;
 ; CHECK-GI-NOFP16-LABEL: v4f16_i32:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NOFP16-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    fcvt s6, h0
-; CHECK-GI-NOFP16-NEXT:    fcvt s7, h1
-; CHECK-GI-NOFP16-NEXT:    mov h16, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-GI-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-GI-NOFP16-NEXT:    fcmp s6, s7
-; CHECK-GI-NOFP16-NEXT:    fcvt s6, h16
-; CHECK-GI-NOFP16-NEXT:    fcvt s7, h17
-; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s4, s5
-; CHECK-GI-NOFP16-NEXT:    fmov s4, w8
-; CHECK-GI-NOFP16-NEXT:    cset w9, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s6, s7
-; CHECK-GI-NOFP16-NEXT:    fmov s5, w9
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s0, s1
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s0, w8
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s0, w8
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    ushll v0.4s, v4.4h, #0
-; CHECK-GI-NOFP16-NEXT:    shl v0.4s, v0.4s, #31
-; CHECK-GI-NOFP16-NEXT:    sshr v0.4s, v0.4s, #31
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    bsl v0.16b, v2.16b, v3.16b
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -1428,74 +1320,15 @@ define <8 x i32> @v8f16_i32(<8 x half> %a, <8 x half> %b, <8 x i32> %d, <8 x i32
 ;
 ; CHECK-GI-NOFP16-LABEL: v8f16_i32:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h0
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h1
-; CHECK-GI-NOFP16-NEXT:    mov h18, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h19, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-GI-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-GI-NOFP16-NEXT:    fcmp s16, s17
-; CHECK-GI-NOFP16-NEXT:    mov h16, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-GI-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s6, s7
-; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-GI-NOFP16-NEXT:    cset w9, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s18, s19
-; CHECK-GI-NOFP16-NEXT:    mov h18, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h19, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-GI-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-GI-NOFP16-NEXT:    cset w10, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s16, s17
-; CHECK-GI-NOFP16-NEXT:    mov h16, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-GI-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-GI-NOFP16-NEXT:    cset w11, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s6, s7
-; CHECK-GI-NOFP16-NEXT:    fcvt s6, h16
-; CHECK-GI-NOFP16-NEXT:    fcvt s7, h17
-; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT:    cset w12, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s18, s19
-; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT:    fmov s16, w12
-; CHECK-GI-NOFP16-NEXT:    cset w13, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s6, s7
-; CHECK-GI-NOFP16-NEXT:    fmov s6, w8
-; CHECK-GI-NOFP16-NEXT:    fmov s7, w9
-; CHECK-GI-NOFP16-NEXT:    fmov s17, w13
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s0, s1
-; CHECK-GI-NOFP16-NEXT:    fmov s0, w10
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v7.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[1], v17.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s1, w8
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s0, w11
-; CHECK-GI-NOFP16-NEXT:    fmov s1, w8
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[3], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    ushll v0.4s, v6.4h, #0
-; CHECK-GI-NOFP16-NEXT:    ushll v1.4s, v16.4h, #0
-; CHECK-GI-NOFP16-NEXT:    shl v0.4s, v0.4s, #31
-; CHECK-GI-NOFP16-NEXT:    shl v1.4s, v1.4s, #31
-; CHECK-GI-NOFP16-NEXT:    sshr v0.4s, v0.4s, #31
-; CHECK-GI-NOFP16-NEXT:    sshr v1.4s, v1.4s, #31
-; CHECK-GI-NOFP16-NEXT:    bsl v0.16b, v2.16b, v4.16b
+; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v7.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-GI-NOFP16-NEXT:    fcmgt v6.4s, v7.4s, v6.4s
+; CHECK-GI-NOFP16-NEXT:    fcmgt v1.4s, v1.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.16b, v6.16b
 ; CHECK-GI-NOFP16-NEXT:    bsl v1.16b, v3.16b, v5.16b
+; CHECK-GI-NOFP16-NEXT:    bsl v0.16b, v2.16b, v4.16b
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: v8f16_i32:
@@ -1664,143 +1497,24 @@ define <16 x i32> @v16f16_i32(<16 x half> %a, <16 x half> %b, <16 x i32> %d, <16
 ;
 ; CHECK-GI-NOFP16-LABEL: v16f16_i32:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h16, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v2.h[1]
-; CHECK-GI-NOFP16-NEXT:    fcvt s18, h0
-; CHECK-GI-NOFP16-NEXT:    fcvt s19, h2
-; CHECK-GI-NOFP16-NEXT:    mov h20, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h21, v2.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-GI-NOFP16-NEXT:    fcmp s18, s19
-; CHECK-GI-NOFP16-NEXT:    mov h18, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov h19, v2.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvt s20, h20
-; CHECK-GI-NOFP16-NEXT:    fcvt s21, h21
-; CHECK-GI-NOFP16-NEXT:    cset w14, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s16, s17
-; CHECK-GI-NOFP16-NEXT:    mov h16, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v2.h[4]
-; CHECK-GI-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-GI-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-GI-NOFP16-NEXT:    cset w15, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s20, s21
-; CHECK-GI-NOFP16-NEXT:    mov h20, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h21, v2.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-GI-NOFP16-NEXT:    cset w9, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s18, s19
-; CHECK-GI-NOFP16-NEXT:    mov h18, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h19, v2.h[6]
-; CHECK-GI-NOFP16-NEXT:    fcvt s20, h20
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-GI-NOFP16-NEXT:    fcvt s21, h21
-; CHECK-GI-NOFP16-NEXT:    mov h2, v2.h[7]
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s16, s17
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h18
-; CHECK-GI-NOFP16-NEXT:    fcvt s18, h1
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h19
-; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT:    fcvt s19, h3
-; CHECK-GI-NOFP16-NEXT:    cset w16, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s20, s21
-; CHECK-GI-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT:    cset w17, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s16, s17
-; CHECK-GI-NOFP16-NEXT:    mov h16, v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v3.h[1]
-; CHECK-GI-NOFP16-NEXT:    cset w11, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s0, s2
-; CHECK-GI-NOFP16-NEXT:    mov h0, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v3.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-GI-NOFP16-NEXT:    cset w10, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s18, s19
-; CHECK-GI-NOFP16-NEXT:    mov h18, v1.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov h19, v3.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT:    cset w18, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s16, s17
-; CHECK-GI-NOFP16-NEXT:    mov h16, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v3.h[4]
-; CHECK-GI-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-GI-NOFP16-NEXT:    fcvt s19, h19
-; CHECK-GI-NOFP16-NEXT:    cset w0, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s0, s2
-; CHECK-GI-NOFP16-NEXT:    mov h0, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v3.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-GI-NOFP16-NEXT:    cset w13, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s18, s19
-; CHECK-GI-NOFP16-NEXT:    mov h18, v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h19, v3.h[6]
-; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-GI-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT:    mov h3, v3.h[7]
-; CHECK-GI-NOFP16-NEXT:    cset w12, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s16, s17
-; CHECK-GI-NOFP16-NEXT:    fcvt s16, h18
-; CHECK-GI-NOFP16-NEXT:    fmov s18, w15
-; CHECK-GI-NOFP16-NEXT:    fcvt s17, h19
-; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT:    fmov s19, w17
-; CHECK-GI-NOFP16-NEXT:    cset w1, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s0, s2
-; CHECK-GI-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-GI-NOFP16-NEXT:    fmov s2, w16
-; CHECK-GI-NOFP16-NEXT:    fmov s0, w14
-; CHECK-GI-NOFP16-NEXT:    cset w14, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s16, s17
-; CHECK-GI-NOFP16-NEXT:    fmov s16, w18
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v19.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s17, w0
-; CHECK-GI-NOFP16-NEXT:    fmov s19, w14
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v18.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s18, w1
-; CHECK-GI-NOFP16-NEXT:    cset w14, mi
-; CHECK-GI-NOFP16-NEXT:    fcmp s1, s3
-; CHECK-GI-NOFP16-NEXT:    fmov s1, w11
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[1], v17.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s17, w9
-; CHECK-GI-NOFP16-NEXT:    fmov s3, w14
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[1], v19.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s1, w13
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v17.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s1, w8
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[2], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    cset w8, mi
-; CHECK-GI-NOFP16-NEXT:    fmov s3, w8
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s1, w10
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[3], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s1, w12
-; CHECK-GI-NOFP16-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[3], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    ushll v3.4s, v18.4h, #0
-; CHECK-GI-NOFP16-NEXT:    ushll v1.4s, v2.4h, #0
-; CHECK-GI-NOFP16-NEXT:    shl v0.4s, v0.4s, #31
+; CHECK-GI-NOFP16-NEXT:    fcvtl v16.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v17.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v18.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v2.4s, v2.8h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v19.4s, v3.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v3.4s, v3.8h
+; CHECK-GI-NOFP16-NEXT:    fcmgt v2.4s, v2.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    fcmgt v17.4s, v19.4s, v17.4s
+; CHECK-GI-NOFP16-NEXT:    fcmgt v16.4s, v18.4s, v16.4s
+; CHECK-GI-NOFP16-NEXT:    fcmgt v3.4s, v3.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    ldp q0, q1, [sp]
 ; CHECK-GI-NOFP16-NEXT:    ldp q18, q19, [sp, #32]
-; CHECK-GI-NOFP16-NEXT:    shl v3.4s, v3.4s, #31
-; CHECK-GI-NOFP16-NEXT:    ushll v2.4s, v16.4h, #0
-; CHECK-GI-NOFP16-NEXT:    shl v1.4s, v1.4s, #31
-; CHECK-GI-NOFP16-NEXT:    sshr v0.4s, v0.4s, #31
-; CHECK-GI-NOFP16-NEXT:    ldp q16, q17, [sp]
-; CHECK-GI-NOFP16-NEXT:    sshr v3.4s, v3.4s, #31
-; CHECK-GI-NOFP16-NEXT:    shl v2.4s, v2.4s, #31
-; CHECK-GI-NOFP16-NEXT:    sshr v1.4s, v1.4s, #31
-; CHECK-GI-NOFP16-NEXT:    bsl v0.16b, v4.16b, v16.16b
+; CHECK-GI-NOFP16-NEXT:    bit v1.16b, v5.16b, v2.16b
+; CHECK-GI-NOFP16-NEXT:    mov v2.16b, v17.16b
+; CHECK-GI-NOFP16-NEXT:    bit v0.16b, v4.16b, v16.16b
 ; CHECK-GI-NOFP16-NEXT:    bsl v3.16b, v7.16b, v19.16b
-; CHECK-GI-NOFP16-NEXT:    sshr v2.4s, v2.4s, #31
-; CHECK-GI-NOFP16-NEXT:    bsl v1.16b, v5.16b, v17.16b
 ; CHECK-GI-NOFP16-NEXT:    bsl v2.16b, v6.16b, v18.16b
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -1808,19 +1522,26 @@ define <16 x i32> @v16f16_i32(<16 x half> %a, <16 x half> %b, <16 x i32> %d, <16
 ; CHECK-GI-FP16:       // %bb.0: // %entry
 ; CHECK-GI-FP16-NEXT:    fcmgt v0.8h, v2.8h, v0.8h
 ; CHECK-GI-FP16-NEXT:    fcmgt v1.8h, v3.8h, v1.8h
-; CHECK-GI-FP16-NEXT:    ldp q2, q20, [sp]
 ; CHECK-GI-FP16-NEXT:    ldp q18, q19, [sp, #32]
-; CHECK-GI-FP16-NEXT:    sshll v3.4s, v0.4h, #0
-; CHECK-GI-FP16-NEXT:    sshll v16.4s, v1.4h, #0
-; CHECK-GI-FP16-NEXT:    sshll2 v17.4s, v1.8h, #0
-; CHECK-GI-FP16-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-GI-FP16-NEXT:    mov v0.16b, v3.16b
+; CHECK-GI-FP16-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-FP16-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-FP16-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-GI-FP16-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-FP16-NEXT:    shl v2.4s, v2.4s, #31
+; CHECK-GI-FP16-NEXT:    shl v0.4s, v0.4s, #31
+; CHECK-GI-FP16-NEXT:    shl v3.4s, v3.4s, #31
+; CHECK-GI-FP16-NEXT:    shl v1.4s, v1.4s, #31
+; CHECK-GI-FP16-NEXT:    sshr v2.4s, v2.4s, #31
+; CHECK-GI-FP16-NEXT:    sshr v16.4s, v0.4s, #31
+; CHECK-GI-FP16-NEXT:    sshr v3.4s, v3.4s, #31
+; CHECK-GI-FP16-NEXT:    sshr v17.4s, v1.4s, #31
+; CHECK-GI-FP16-NEXT:    ldp q0, q1, [sp]
+; CHECK-GI-FP16-NEXT:    bit v0.16b, v4.16b, v2.16b
+; CHECK-GI-FP16-NEXT:    mov v2.16b, v3.16b
 ; CHECK-GI-FP16-NEXT:    mov v3.16b, v17.16b
-; CHECK-GI-FP16-NEXT:    bsl v1.16b, v5.16b, v20.16b
-; CHECK-GI-FP16-NEXT:    bsl v0.16b, v4.16b, v2.16b
-; CHECK-GI-FP16-NEXT:    mov v2.16b, v16.16b
-; CHECK-GI-FP16-NEXT:    bsl v3.16b, v7.16b, v19.16b
+; CHECK-GI-FP16-NEXT:    bit v1.16b, v5.16b, v16.16b
 ; CHECK-GI-FP16-NEXT:    bsl v2.16b, v6.16b, v18.16b
+; CHECK-GI-FP16-NEXT:    bsl v3.16b, v7.16b, v19.16b
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = fcmp olt <16 x half> %a, %b



More information about the llvm-commits mailing list